Diffstat (limited to 'net/core/dev.c')
| mode | path | lines changed |
| -rw-r--r-- | net/core/dev.c | 12155 |
1 file changed, 9476 insertions, 2679 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index a3d8d44cb7f4..9094c0fb8c68 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * NET3 Protocol independent device support routines. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * NET3 Protocol independent device support routines. * * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro + * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * @@ -21,9 +17,9 @@ * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set - * to 2 if register_netdev gets called - * before net_dev_init & also removed a - * few lines of code in the process. + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. @@ -36,7 +32,7 @@ * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. - * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before @@ -46,7 +42,7 @@ * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. - * Alan Cox : Fixed nasty side effect of device close + * Alan Cox : Fixed nasty side effect of device close * changes. 
* Rudi Cilibrasi : Pass the right thing to * set_mac_address() @@ -67,13 +63,13 @@ * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait - * indefinitely on dev->refcnt - * J Hadi Salim : - Backlog queue sampling + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ -#include <asm/uaccess.h> -#include <linux/bitops.h> +#include <linux/uaccess.h> +#include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> @@ -81,7 +77,11 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/isolation.h> +#include <linux/sched/mm.h> +#include <linux/smpboot.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> @@ -92,16 +92,26 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> -#include <linux/notifier.h> +#include <linux/ethtool_netlink.h> #include <linux/skbuff.h> +#include <linux/kthread.h> +#include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> +#include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> +#include <net/dsa.h> #include <net/dst.h> +#include <net/dst_metadata.h> +#include <net/gro.h> +#include <net/netdev_queues.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> @@ -118,6 +128,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> +#include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> @@ -125,65 +136,64 @@ #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> -#include <linux/pci.h> +#include <trace/events/qdisc.h> +#include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> - +#include <linux/if_macvlan.h> +#include <linux/errqueue.h> +#include <linux/hrtimer.h> +#include <linux/netfilter_netdev.h> +#include <linux/crash_dump.h> +#include <linux/sctp.h> +#include <net/udp_tunnel.h> +#include <linux/net_namespace.h> +#include <linux/indirect_call_wrapper.h> +#include <net/devlink.h> +#include <linux/pm_runtime.h> +#include <linux/prandom.h> +#include <linux/once_lite.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/rps.h> +#include <linux/phy_link_topology.h> + +#include "dev.h" +#include "devmem.h" #include "net-sysfs.h" -/* Instead of increasing this, you should create a hash table. */ -#define MAX_GRO_SKBS 8 - -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(ptype_lock); -static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -struct list_head ptype_all __read_mostly; /* Taps */ -static struct list_head offload_base __read_mostly; -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. 
- * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); + +static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; -static DEFINE_HASHTABLE(napi_hash, 8); - -seqcount_t devnet_rename_seq; +static unsigned int napi_gen_id = NR_CPUS; +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0); + unsigned int val = net->dev_base_seq + 1; + + WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { - unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } @@ -193,33 +203,218 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *sd) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) { -#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -#endif + static_branch_enable(&use_backlog_threads_key); + return 0; } +early_param("thread_backlog_napi", setup_backlog_napi_threads); -static inline void rps_unlock(struct softnet_data *sd) +static bool use_backlog_threads(void) { -#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ + return true; +} + #endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else + local_irq_save(*flags); +} + +static inline void backlog_lock_irq_disable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_lock_irq(&sd->input_pkt_queue.lock); + else + local_irq_disable(); +} + +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else + local_irq_restore(*flags); +} + +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else + local_irq_enable(); +} + +static struct netdev_name_node *netdev_name_node_alloc(struct 
net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +bool netdev_name_in_use(struct net *net, const char *name) +{ + return netdev_name_node_lookup(net, name); +} +EXPORT_SYMBOL(netdev_name_in_use); + +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail_rcu(&name_node->list, &dev->name_node->list); + + return 0; +} + +static void netdev_name_node_alt_free(struct rcu_head *head) +{ + struct netdev_name_node *name_node = + container_of(head, struct netdev_name_node, rcu); + + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + netdev_name_node_del(name_node); + list_del(&name_node->list); + call_rcu(&name_node->rcu, netdev_name_node_alt_free); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + /* lookup might have found our primary name or a name belonging + * to another device. 
+ */ + if (name_node == dev->name_node || name_node->dev != dev) + return -EINVAL; + + __netdev_name_node_alt_destroy(name_node); + return 0; +} + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { + list_del(&name_node->list); + netdev_name_node_alt_free(&name_node->rcu); + } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { + struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); - write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_add(net, name_node); + + /* We reserved the ifindex, this can't fail */ + WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } @@ -229,14 +424,20 @@ static void list_netdevice(struct net_device *dev) */ static void unlist_netdevice(struct net_device *dev) { + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + ASSERT_RTNL(); + xa_erase(&net->dev_by_index, dev->ifindex); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_del(name_node); + /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -252,16 +453,26 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { + .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), +}; EXPORT_PER_CPU_SYMBOL(softnet_data); +/* Page_pool has a lockless array/stack to alloc/recycle pages. + * PP consumers must pay attention to run APIs in the appropriate context + * (e.g. NAPI context). 
+ */ +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; + #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ -static const unsigned short netdev_lock_type[] = - {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, @@ -277,22 +488,22 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; -static const char *const netdev_lock_name[] = - {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", - "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", - "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", - "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", - "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", - "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", - "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", - "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", - "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", - "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", - "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", - "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", - "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", - "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -332,16 +543,18 @@ static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } + static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* + * + * Protocol management and registration routines + * + *******************************************************************************/ - Protocol management and registration routines - -*******************************************************************************/ /* * Add a protocol ID to the list. 
Now that the input handler is @@ -361,10 +574,19 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { - if (pt->type == htons(ETH_P_ALL)) - return &ptype_all; - else - return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + if (pt->type == htons(ETH_P_ALL)) { + if (!pt->af_packet_net && !pt->dev) + return NULL; + + return pt->dev ? &pt->dev->ptype_all : + &pt->af_packet_net->ptype_all; + } + + if (pt->dev) + return &pt->dev->ptype_specific; + + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** @@ -384,6 +606,9 @@ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); + if (WARN_ON_ONCE(!head)) + return; + spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); @@ -408,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt) struct list_head *head = ptype_head(pt); struct packet_type *pt1; + if (!head) + return; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { @@ -444,286 +672,258 @@ void dev_remove_pack(struct packet_type *pt) EXPORT_SYMBOL(dev_remove_pack); -/** - * dev_add_offload - register offload handlers - * @po: protocol offload declaration +/******************************************************************************* * - * Add protocol offload handlers to the networking stack. The passed - * &proto_offload is linked into kernel lists and may not be freed until - * it has been removed from the kernel lists. + * Device Interface Subroutines * - * This call does not sleep therefore it can not - * guarantee all CPU's that are in middle of receiving packets - * will see the new offload handlers (until the next received packet). + *******************************************************************************/ + +/** + * dev_get_iflink - get 'iflink' value of a interface + * @dev: targeted interface + * + * Indicates the ifindex the interface is linked to. + * Physical interfaces have the same 'ifindex' and 'iflink' values. */ -void dev_add_offload(struct packet_offload *po) + +int dev_get_iflink(const struct net_device *dev) { - struct list_head *head = &offload_base; + if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) + return dev->netdev_ops->ndo_get_iflink(dev); - spin_lock(&offload_lock); - list_add_rcu(&po->list, head); - spin_unlock(&offload_lock); + return READ_ONCE(dev->ifindex); } -EXPORT_SYMBOL(dev_add_offload); +EXPORT_SYMBOL(dev_get_iflink); /** - * __dev_remove_offload - remove offload handler - * @po: packet offload declaration - * - * Remove a protocol offload handler that was previously added to the - * kernel offload handlers by dev_add_offload(). The passed &offload_type - * is removed from the kernel lists and can be freed or reused once this - * function returns. + * dev_fill_metadata_dst - Retrieve tunnel egress information. + * @dev: targeted interface + * @skb: The packet. * - * The packet type might still be in use by receivers - * and must not be freed until after all the CPU's have gone - * through a quiescent state. + * For better visibility of tunnel traffic OVS needs to retrieve + * egress tunnel information for a packet. Following API allows + * user to get this info. 
*/ -void __dev_remove_offload(struct packet_offload *po) +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { - struct list_head *head = &offload_base; - struct packet_offload *po1; + struct ip_tunnel_info *info; - spin_lock(&offload_lock); + if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) + return -EINVAL; - list_for_each_entry(po1, head, list) { - if (po == po1) { - list_del_rcu(&po->list); - goto out; - } - } + info = skb_tunnel_info_unclone(skb); + if (!info) + return -ENOMEM; + if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) + return -EINVAL; - pr_warn("dev_remove_offload: %p not found\n", po); -out: - spin_unlock(&offload_lock); + return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } -EXPORT_SYMBOL(__dev_remove_offload); +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); -/** - * dev_remove_offload - remove packet offload handler - * @po: packet offload declaration - * - * Remove a packet offload handler that was previously added to the kernel - * offload handlers by dev_add_offload(). The passed &offload_type is - * removed from the kernel lists and can be freed or reused once this - * function returns. - * - * This call sleeps to guarantee that no CPU is looking at the packet - * type after return. - */ -void dev_remove_offload(struct packet_offload *po) +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { - __dev_remove_offload(po); + int k = stack->num_paths++; - synchronize_net(); + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; } -EXPORT_SYMBOL(dev_remove_offload); -/****************************************************************************** +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + }; + struct net_device_path *path; + int ret = 0; - Device Boot-time Settings Routines + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; -*******************************************************************************/ + memset(path, 0, sizeof(struct net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; -/* Boot time configuration table */ -static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } -/** - * netdev_boot_setup_add - add new setup entry - * @name: name of the device - * @map: configured settings for the device - * - * Adds new setup entry to the dev_boot_setup list. The function - * returns 0 on error and 1 on success. This is a generic routine to - * all netdevices. - */ -static int netdev_boot_setup_add(char *name, struct ifmap *map) -{ - struct netdev_boot_setup *s; - int i; + if (!ctx.dev) + return ret; - s = dev_boot_setup; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { - memset(s[i].name, 0, sizeof(s[i].name)); - strlcpy(s[i].name, name, IFNAMSIZ); - memcpy(&s[i].map, map, sizeof(s[i].map)); - break; - } - } + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; - return i >= NETDEV_BOOT_SETUP_MAX ? 
0 : 1; + return ret; } +EXPORT_SYMBOL_GPL(dev_fill_forward_path); -/** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice - * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. - */ -int netdev_boot_setup_check(struct net_device *dev) +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) { - struct netdev_boot_setup *s = dev_boot_setup; - int i; + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; - return 1; - } - } - return 0; -} -EXPORT_SYMBOL(netdev_boot_setup_check); + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + return NULL; +} -/** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device - * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. - */ -unsigned long netdev_boot_base(const char *prefix, int unit) +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct * +netdev_napi_by_id(struct net *net, unsigned int napi_id) { - const struct netdev_boot_setup *s = dev_boot_setup; - char name[IFNAMSIZ]; - int i; + struct napi_struct *napi; - sprintf(name, "%s%d", prefix, unit); + napi = napi_by_id(napi_id); + if (!napi) + return NULL; - /* - * If device already registered then return base of 1 - * to indicate not to probe for this interface - */ - if (__dev_get_by_name(&init_net, name)) - return 1; + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) - if (!strcmp(name, s[i].name)) - return s[i].map.base_addr; - return 0; + return napi; } -/* - * Saves at boot time configured settings for any netdevice. +/** + * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it + * @net: the applicable net namespace + * @napi_id: ID of a NAPI of a target device + * + * Find a NAPI instance with @napi_id. Lock its device. + * The device must be in %NETREG_REGISTERED state for lookup to succeed. + * netdev_unlock() must be called to release it. + * + * Return: pointer to NAPI, its device with lock held, NULL if not found. 
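A rough usage sketch of the lookup-and-lock helper documented here (illustrative only, not part of this patch): the wrapper function, its name and its error handling are assumptions, and netdev_napi_by_id_lock() itself is core-internal, declared in net/core/dev.h.

#include "dev.h"		/* net/core internal declarations (assumed include path) */
#include <net/netdev_lock.h>

static int example_inspect_napi(struct net *net, unsigned int napi_id)
{
	struct napi_struct *napi;

	napi = netdev_napi_by_id_lock(net, napi_id);
	if (!napi)
		return -ENOENT;

	/* napi->dev instance lock is held here; the NAPI and its device
	 * cannot disappear until netdev_unlock() is called.
	 */
	netdev_dbg(napi->dev, "napi id %u is attached to this device\n",
		   napi->napi_id);

	netdev_unlock(napi->dev);
	return 0;
}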
*/ -int __init netdev_boot_setup(char *str) +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) { - int ints[5]; - struct ifmap map; - - str = get_options(str, ARRAY_SIZE(ints), ints); - if (!str || !*str) - return 0; - - /* Save settings */ - memset(&map, 0, sizeof(map)); - if (ints[0] > 0) - map.irq = ints[1]; - if (ints[0] > 1) - map.base_addr = ints[2]; - if (ints[0] > 2) - map.mem_start = ints[3]; - if (ints[0] > 3) - map.mem_end = ints[4]; + struct napi_struct *napi; + struct net_device *dev; - /* Add new entry to the list */ - return netdev_boot_setup_add(str, &map); -} + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { + rcu_read_unlock(); + return NULL; + } -__setup("netdev=", netdev_boot_setup); + dev = napi->dev; + dev_hold(dev); + rcu_read_unlock(); -/******************************************************************************* + dev = __netdev_put_lock(dev, net); + if (!dev) + return NULL; - Device Interface Subroutines + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (napi && napi->dev != dev) + napi = NULL; + rcu_read_unlock(); -*******************************************************************************/ + if (!napi) + netdev_unlock(dev); + return napi; +} /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** - * dev_get_by_name_rcu - find a device by its name - * @net: the applicable net namespace - * @name: name to find + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find * - * Find an interface by name. - * If the name is found a pointer to the device is returned. - * If the name is not found then %NULL is returned. - * The reference counters are not incremented so the caller must be - * careful with locks. The caller must hold RCU lock. + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? 
node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); +/* Deprecated for new users, call netdev_get_by_name() instead */ +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_name); + /** - * dev_get_by_name - find a device by its name + * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has - * the usage count incremented and the caller must use dev_put() to + * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ - -struct net_device *dev_get_by_name(struct net *net, const char *name) +struct net_device *netdev_get_by_name(struct net *net, const char *name, + netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_name_rcu(net, name); + dev = dev_get_by_name(net, name); if (dev) - dev_hold(dev); - rcu_read_unlock(); + netdev_tracker_alloc(dev, tracker, gfp); return dev; } -EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex @@ -733,8 +933,7 @@ EXPORT_SYMBOL(dev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -774,63 +973,229 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) } EXPORT_SYMBOL(dev_get_by_index_rcu); +/* Deprecated for new users, call netdev_get_by_index() instead */ +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_index); /** - * dev_get_by_index - find a device by its ifindex + * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. + * netdev_put() to indicate they have finished with it. */ +struct net_device *netdev_get_by_index(struct net *net, int ifindex, + netdevice_tracker *tracker, gfp_t gfp) +{ + struct net_device *dev; -struct net_device *dev_get_by_index(struct net *net, int ifindex) + dev = dev_get_by_index(net, ifindex); + if (dev) + netdev_tracker_alloc(dev, tracker, gfp); + return dev; +} +EXPORT_SYMBOL(netdev_get_by_index); + +/** + * dev_get_by_napi_id - find a device by napi_id + * @napi_id: ID of the NAPI struct + * + * Search for an interface by NAPI ID. 
Returns %NULL if the device + * is not found or a pointer to the device. The device has not had + * its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ + struct napi_struct *napi; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!napi_id_valid(napi_id)) + return NULL; + + napi = napi_by_id(napi_id); + + return napi ? napi->dev : NULL; +} + +/* Release the held reference on the net_device, and if the net_device + * is still registered try to lock the instance lock. If device is being + * unregistered NULL will be returned (but the reference has been released, + * either way!) + * + * This helper is intended for locking net_device after it has been looked up + * using a lockless lookup helper. Lock prevents the instance from going away. + */ +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) +{ + netdev_lock(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +/** + * netdev_get_by_index_lock() - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. If a valid device + * with @ifindex is found it will be returned with netdev->lock held. + * netdev_unlock() must be called to release it. + * + * Return: pointer to a device with lock held, NULL if not found. 
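A minimal sketch of the tracker-aware lookup API added in this hunk, pairing netdev_get_by_index() with netdev_put(); the wrapper function, its name and the GFP_KERNEL calling context are assumptions for illustration, not code from the patch:

#include <linux/netdevice.h>

static void example_hold_by_ifindex(struct net *net, int ifindex)
{
	netdevice_tracker tracker;
	struct net_device *dev;

	/* Takes a reference on the device and records it in @tracker. */
	dev = netdev_get_by_index(net, ifindex, &tracker, GFP_KERNEL);
	if (!dev)
		return;

	netdev_info(dev, "holding a tracked reference\n");

	/* Releases the reference recorded in @tracker. */
	netdev_put(dev, &tracker);
}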
+ */ +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, ifindex); + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock(dev, net); +} + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); +} + +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index) +{ if (dev) + netdev_unlock(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } dev_hold(dev); - rcu_read_unlock(); - return dev; + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); } -EXPORT_SYMBOL(dev_get_by_index); + +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} +EXPORT_IPV6_MOD_GPL(netdev_copy_name); /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. - * - * The use of raw_seqcount_begin() and cond_resched() before - * retrying is required as we want to give the writers a chance - * to complete when CONFIG_PREEMPT is not set. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; - unsigned int seq; + int ret; -retry: - seq = raw_seqcount_begin(&devnet_rename_seq); rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { - rcu_read_unlock(); - return -ENODEV; + ret = -ENODEV; + goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); + + ret = 0; +out: rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) { - cond_resched(); - goto retry; - } + return ret; +} - return 0; +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); } /** @@ -841,7 +1206,7 @@ retry: * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. 
* The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -853,26 +1218,38 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) { struct net_device *dev; ASSERT_RTNL(); for_each_netdev(net, dev) - if (dev->type == type) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } -EXPORT_SYMBOL(__dev_getfirstbyhwtype); +EXPORT_SYMBOL(dev_getbyhwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { @@ -891,51 +1268,52 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags - * @net: the applicable net namespace - * @if_flags: IFF_* values - * @mask: bitmask of bits in if_flags to check + * netdev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace + * @tracker: tracking object for the acquired reference + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check * - * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * Search for any interface with the given flags. + * + * Context: rcu_read_lock() must be held. + * Returns: NULL if a device is not found or a pointer to the device. */ - -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, + unsigned short if_flags, unsigned short mask) { - struct net_device *dev, *ret; + struct net_device *dev; - ret = NULL; for_each_netdev_rcu(net, dev) { - if (((dev->flags ^ if_flags) & mask) == 0) { - ret = dev; - break; + if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) { + netdev_hold(dev, tracker, GFP_ATOMIC); + return dev; } } - return ret; + + return NULL; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_IPV6_MOD(netdev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work. We also disallow any kind of + * allow sysfs to work. We also disallow any kind of * whitespace. 
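A minimal sketch of the locking contract for the hardware-address lookups a few lines above: dev_getbyhwaddr_rcu() under rcu_read_lock(), the new dev_getbyhwaddr() under rtnl_lock(). The wrapper function and its name are assumptions for illustration:

#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>

static void example_hwaddr_lookup(struct net *net, const char *mac)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		netdev_info(dev, "matched under RCU\n");
	rcu_read_unlock();

	rtnl_lock();
	dev = dev_getbyhwaddr(net, ARPHRD_ETHER, mac);
	if (dev)
		netdev_info(dev, "matched under RTNL\n");
	rtnl_unlock();
}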
*/ bool dev_valid_name(const char *name) { if (*name == '\0') return false; - if (strlen(name) >= IFNAMSIZ) + if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { - if (*name == '/' || isspace(*name)) + if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } @@ -947,7 +1325,7 @@ EXPORT_SYMBOL(dev_valid_name); * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string - * @buf: scratch buffer and result name string + * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses @@ -958,55 +1336,79 @@ EXPORT_SYMBOL(dev_valid_name); * Returns the number of the unit assigned or a negative errno code. */ -static int __dev_alloc_name(struct net *net, const char *name, char *buf) +static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; + char buf[IFNAMSIZ]; - p = strnchr(name, IFNAMSIZ-1, '%'); - if (p) { - /* - * Verify the string as this thing may have come from - * the user. There must be either one "%d" and no other "%" - * characters. - */ - if (p[1] != 'd' || strchr(p + 2, '%')) - return -EINVAL; + /* Verify the string as this thing may have come from the user. + * There must be one "%d" and no other "%" characters. + */ + p = strchr(name, '%'); + if (!p || p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; - /* Use one page as a bit array of possible slots */ - inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); - if (!inuse) - return -ENOMEM; + /* Use one page as a bit array of possible slots */ + inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); + if (!inuse) + return -ENOMEM; - for_each_netdev(net, d) { - if (!sscanf(d->name, name, &i)) + for_each_netdev(net, d) { + struct netdev_name_node *name_node; + + netdev_for_each_altname(d, name_node) { + if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; - /* avoid cases where sscanf is not exact inverse of printf */ + /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); - if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); + if (!strncmp(buf, name_node->name, IFNAMSIZ)) + __set_bit(i, inuse); } + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; - i = find_first_zero_bit(inuse, max_netdevices); - free_page((unsigned long) inuse); + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + __set_bit(i, inuse); } - if (buf != name) - snprintf(buf, IFNAMSIZ, name, i); - if (!__dev_get_by_name(net, buf)) - return i; + i = find_first_zero_bit(inuse, max_netdevices); + bitmap_free(inuse); + if (i == max_netdevices) + return -ENFILE; - /* It is possible to run out of possible slots - * when the name is long and there isn't enough space left - * for the digits, or if all bits are used. 
- */ - return -ENFILE; + /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ + strscpy(buf, name, IFNAMSIZ); + snprintf(res, IFNAMSIZ, buf, i); + return i; +} + +/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ +static int dev_prep_valid_name(struct net *net, struct net_device *dev, + const char *want_name, char *out_name, + int dup_errno) +{ + if (!dev_valid_name(want_name)) + return -EINVAL; + + if (strchr(want_name, '%')) + return __dev_alloc_name(net, want_name, out_name); + + if (netdev_name_in_use(net, want_name)) + return -dup_errno; + if (out_name != want_name) + strscpy(out_name, want_name, IFNAMSIZ); + return 0; } /** @@ -1025,107 +1427,65 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) int dev_alloc_name(struct net_device *dev, const char *name) { - char buf[IFNAMSIZ]; - struct net *net; - int ret; - - BUG_ON(!dev_net(dev)); - net = dev_net(dev); - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; + return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); -static int dev_alloc_name_ns(struct net *net, - struct net_device *dev, - const char *name) -{ - char buf[IFNAMSIZ]; - int ret; - - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; -} - -static int dev_get_valid_name(struct net *net, - struct net_device *dev, +static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - BUG_ON(!net); - - if (!dev_valid_name(name)) - return -EINVAL; - - if (strchr(name, '%')) - return dev_alloc_name_ns(net, dev, name); - else if (__dev_get_by_name(net, name)) - return -EEXIST; - else if (dev->name != name) - strlcpy(dev->name, name, IFNAMSIZ); + int ret; - return 0; + ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); + return ret < 0 ? ret : 0; } -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, const char *newname) +int netif_change_name(struct net_device *dev, const char *newname) { + struct net *net = dev_net(dev); + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; - struct net *net; - - ASSERT_RTNL(); - BUG_ON(!dev_net(dev)); - net = dev_net(dev); - if (dev->flags & IFF_UP) - return -EBUSY; + ASSERT_RTNL_NET(net); - write_seqcount_begin(&devnet_rename_seq); - - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); + if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; - } memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); - if (err < 0) { - write_seqcount_end(&devnet_rename_seq); + write_sequnlock_bh(&netdev_rename_lock); + + if (err < 0) return err; - } + + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s%s\n", oldname, + dev->flags & IFF_UP ? 
" (while UP)" : ""); + + old_assign_type = dev->name_assign_type; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); - write_seqcount_end(&devnet_rename_seq); + write_sequnlock_bh(&netdev_rename_lock); + WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } - write_seqcount_end(&devnet_rename_seq); + netdev_adjacent_rename_links(dev, oldname); - write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); - write_unlock_bh(&dev_base_lock); + netdev_name_node_del(dev->name_node); - synchronize_rcu(); + synchronize_net(); - write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); - write_unlock_bh(&dev_base_lock); + netdev_name_node_add(net, dev->name_node); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1134,50 +1494,71 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); + memcpy(oldname, newname, IFNAMSIZ); + WRITE_ONCE(dev->name_assign_type, old_assign_type); + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { - pr_err("%s: name change rollback failed: %d\n", - dev->name, ret); + netdev_err(dev, "name change rollback failed: %d\n", + ret); } } return err; } -/** - * dev_set_alias - change ifalias of a device - * @dev: device - * @alias: name up to IFALIASZ - * @len: limit of bytes to copy from info - * - * Set ifalias for a device, - */ -int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +int netif_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; + mutex_lock(&ifalias_mutex); + new_alias = rcu_replace_pointer(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); - strlcpy(dev->ifalias, alias, len+1); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @name: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. 
+ */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features @@ -1191,26 +1572,43 @@ void netdev_features_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_features_change); +void netif_state_change(struct net_device *dev) +{ + netdev_ops_assert_locked_or_invisible(dev); + + if (dev->flags & IFF_UP) { + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; + + call_netdevice_notifiers_info(NETDEV_CHANGE, + &change_info.info); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); + } +} + /** - * netdev_state_change - device changes state - * @dev: device to cause notification + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device * - * Called to indicate a device has changed state. This function calls - * the notifier chains for netdev_chain and sends a NEWLINK message - * to the routing socket. + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. */ -void netdev_state_change(struct net_device *dev) +void __netdev_notify_peers(struct net_device *dev) { - if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); - } + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } -EXPORT_SYMBOL(netdev_state_change); +EXPORT_SYMBOL(__netdev_notify_peers); /** - * netdev_notify_peers - notify network peers about existence of @dev - * @dev: network device + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when @@ -1221,47 +1619,75 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. 
+ */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); + dev_addr_check(dev); - if (!netif_device_present(dev)) - return -ENODEV; + if (!netif_device_present(dev)) { + /* may be detached because parent is runtime-suspended */ + if (dev->dev.parent) + pm_runtime_resume(dev->dev.parent); + if (!netif_device_present(dev)) + return -ENODEV; + } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - netpoll_rx_disable(dev); + netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); + netdev_ops_assert_locked(dev); + if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - netpoll_rx_enable(dev); + netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - dev->flags |= IFF_UP; - net_dmaengine_get(); + netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1270,44 +1696,34 @@ static int __dev_open(struct net_device *dev) return ret; } -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev) +int netif_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } -EXPORT_SYMBOL(dev_open); -static int __dev_close_many(struct list_head *head) +static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1318,12 +1734,12 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). 
*/ } dev_deactivate_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* @@ -1333,243 +1749,524 @@ static int __dev_close_many(struct list_head *head) * We allow it to be called even after a DETACH hot-plug * event. */ + + netdev_ops_assert_locked(dev); + if (ops->ndo_stop) ops->ndo_stop(dev); - dev->flags &= ~IFF_UP; - net_dmaengine_put(); + netif_set_up(dev, false); + netpoll_poll_enable(dev); } - - return 0; } -static int __dev_close(struct net_device *dev) +static void __dev_close(struct net_device *dev) { - int retval; LIST_HEAD(single); - /* Temporarily disable netpoll until the interface is down */ - netpoll_rx_disable(dev); - - list_add(&dev->unreg_list, &single); - retval = __dev_close_many(&single); + list_add(&dev->close_list, &single); + __dev_close_many(&single); list_del(&single); - - netpoll_rx_enable(dev); - return retval; } -static int dev_close_many(struct list_head *head) +void netif_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; - LIST_HEAD(tmp_list); - list_for_each_entry_safe(dev, tmp, head, unreg_list) + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) - list_move(&dev->unreg_list, &tmp_list); + list_del_init(&dev->close_list); __dev_close_many(head); - list_for_each_entry(dev, head, unreg_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + list_for_each_entry_safe(dev, tmp, head, close_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); + if (unlink) + list_del_init(&dev->close_list); } - - /* rollback_registered_many needs the complete original list */ - list_splice(&tmp_list, head); - return 0; } +EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL"); -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. - */ -int dev_close(struct net_device *dev) +void netif_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); - /* Block netpoll rx while the interface is going down */ - netpoll_rx_disable(dev); - - list_add(&dev->unreg_list, &single); - dev_close_many(&single); + list_add(&dev->close_list, &single); + netif_close_many(&single, true); list_del(&single); - - netpoll_rx_enable(dev); } - return 0; } -EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(netif_close); + +void netif_disable_lro(struct net_device *dev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_LRO)) + netdev_WARN(dev, "failed to disable LRO!\n"); + netdev_for_each_lower_dev(dev, lower_dev, iter) { + netdev_lock_ops(lower_dev); + netif_disable_lro(lower_dev); + netdev_unlock_ops(lower_dev); + } +} +EXPORT_IPV6_MOD(netif_disable_lro); /** - * dev_disable_lro - disable Large Receive Offload on a device + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. This is needed if received packets may be - * forwarded to another interface. 
+ * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. */ -void dev_disable_lro(struct net_device *dev) +static void dev_disable_gro_hw(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - - dev->wanted_features &= ~NETIF_F_LRO; + dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); - if (unlikely(dev->features & NETIF_F_LRO)) - netdev_WARN(dev, "failed to disable LRO!\n"); + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); +} + +const char *netdev_cmd_to_name(enum netdev_cmd cmd) +{ +#define N(val) \ + case NETDEV_##val: \ + return "NETDEV_" __stringify(val); + switch (cmd) { + N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) + N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) + N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) + N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) + N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) + N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) + N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) + N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) + N(XDP_FEAT_CHANGE) + } +#undef N + return "UNKNOWN_NETDEV_EVENT"; } -EXPORT_SYMBOL(dev_disable_lro); +EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + netdev_lock_ops(dev); + err = call_netdevice_register_notifiers(nb, dev); + netdev_unlock_ops(dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** - * register_netdevice_notifier - register a network notifier block - * @nb: notifier + * register_netdevice_notifier - register a network notifier block + * @nb: notifier * - * Register a notifier 
to be called when network device events occur. - * The notifier passed is linked into the kernel structures and must - * not be reused until it has been unregistered. A negative errno code - * is returned on a failure. + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. * - * When registered all registration and up events are replayed - * to the new notifier to allow device to have a race free - * view of the network device list. + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + + /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + __rtnl_net_lock(net); + err = call_netdevice_register_net_notifiers(nb, net); + __rtnl_net_unlock(net); + if (err) + goto rollback; } unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; - - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } + for_each_net_continue_reverse(net) { + __rtnl_net_lock(net); + call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); } -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** - * unregister_netdevice_notifier - unregister a network notifier block - * @nb: notifier + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier * - * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the - * kernel structures and may then be reused. A negative errno code - * is returned on a failure. + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. * - * After unregistering unregister and down device events are synthesized - * for all devices on the device list to the removed notifier to remove - * the need for special case cleanup code. + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. 
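+ *
+ *	Typical pairing (illustrative sketch only; the "foo_*" names are
+ *	made up):
+ *
+ *		static struct notifier_block foo_nb = {
+ *			.notifier_call	= foo_netdev_event,
+ *		};
+ *
+ *		err = register_netdevice_notifier(&foo_nb);
+ *		...
+ *		unregister_netdevice_notifier(&foo_nb);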
*/ int unregister_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } + __rtnl_net_lock(net); + call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); } + unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); +static int __register_netdevice_notifier_net(struct net *net, + struct notifier_block *nb, + bool ignore_call_fail) +{ + int err; + + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + return err; + if (dev_boot_phase) + return 0; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err && !ignore_call_fail) + goto chain_unregister; + + return 0; + +chain_unregister: + raw_notifier_chain_unregister(&net->netdev_chain, nb); + return err; +} + +static int __unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + return err; + + call_netdevice_unregister_net_notifiers(nb, net); + return 0; +} + +/** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_net_lock(net); + err = __register_netdevice_notifier_net(net, nb, false); + rtnl_net_unlock(net); + + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier_net(). The notifier is unlinked from the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. 
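+ *
+ *	Usage mirrors the global variant above, with an explicit namespace
+ *	(illustrative sketch; "foo_nb" is a made-up notifier block):
+ *
+ *		err = register_netdevice_notifier_net(net, &foo_nb);
+ *		...
+ *		unregister_netdevice_notifier_net(net, &foo_nb);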
+ */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_net_lock(net); + err = __unregister_netdevice_notifier_net(net, nb); + rtnl_net_unlock(net); + + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + +static void __move_netdevice_notifier_net(struct net *src_net, + struct net *dst_net, + struct notifier_block *nb) +{ + __unregister_netdevice_notifier_net(src_net, nb); + __register_netdevice_notifier_net(dst_net, nb, true); +} + +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. */ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_net_dev_lock(dev); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); + if (!err) { + nn->nb = nb; + list_add(&nn->list, &dev->net_notifier_list); + } + rtnl_net_dev_unlock(dev); + + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_dev_net); + +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_net_dev_lock(dev); + list_del(&nn->list); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_net_dev_unlock(dev); + + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); + +static void move_netdevice_notifiers_dev_net(struct net_device *dev, + struct net *net) +{ + struct netdev_net_notifier *nn; + + list_for_each_entry(nn, &dev->net_notifier_list, list) + __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); +} + /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function - * @dev: net_device pointer passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, +int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. 
+ */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } -EXPORT_SYMBOL(call_netdevice_notifiers_info); + +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for and rollback on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). + */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} /** * call_netdevice_notifiers - call all network notifier blocks @@ -1582,81 +2279,160 @@ EXPORT_SYMBOL(call_netdevice_notifiers_info); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; - - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct static_key netstamp_needed __read_mostly; -#ifdef HAVE_JUMP_LABEL -/* We are not allowed to call static_key_slow_dec() from irq context - * If net_disable_timestamp() is called from irq context, defer the - * static_key_slow_dec() calls. +/** + * call_netdevice_notifiers_mtu - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @arg: additional u32 argument passed to the notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). 
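+ *
+ *	A listener can recover @arg because &struct netdev_notifier_info_ext
+ *	embeds the plain info as its first member (enforced by the
+ *	BUILD_BUG_ON() below).  Illustrative sketch, variable names made up:
+ *
+ *		struct netdev_notifier_info_ext *ext = ptr;
+ *		u32 mtu_arg = ext->ext.mtu;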
*/ +static int call_netdevice_notifiers_mtu(unsigned long val, + struct net_device *dev, u32 arg) +{ + struct netdev_notifier_info_ext info = { + .info.dev = dev, + .ext.mtu = arg, + }; + + BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); + + return call_netdevice_notifiers_info(val, &info.info); +} + +#ifdef CONFIG_NET_INGRESS +static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); + +void net_inc_ingress_queue(void) +{ + static_branch_inc(&ingress_needed_key); +} +EXPORT_SYMBOL_GPL(net_inc_ingress_queue); + +void net_dec_ingress_queue(void) +{ + static_branch_dec(&ingress_needed_key); +} +EXPORT_SYMBOL_GPL(net_dec_ingress_queue); +#endif + +#ifdef CONFIG_NET_EGRESS +static DEFINE_STATIC_KEY_FALSE(egress_needed_key); + +void net_inc_egress_queue(void) +{ + static_branch_inc(&egress_needed_key); +} +EXPORT_SYMBOL_GPL(net_inc_egress_queue); + +void net_dec_egress_queue(void) +{ + static_branch_dec(&egress_needed_key); +} +EXPORT_SYMBOL_GPL(net_dec_egress_queue); +#endif + +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); +EXPORT_SYMBOL(tcf_sw_enabled_key); +#endif + +DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +EXPORT_SYMBOL(netstamp_needed_key); +#ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; +static atomic_t netstamp_wanted; +static void netstamp_clear(struct work_struct *work) +{ + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; + + wanted = atomic_add_return(deferred, &netstamp_wanted); + if (wanted > 0) + static_branch_enable(&netstamp_needed_key); + else + static_branch_disable(&netstamp_needed_key); +} +static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { -#ifdef HAVE_JUMP_LABEL - int deferred = atomic_xchg(&netstamp_needed_deferred, 0); +#ifdef CONFIG_JUMP_LABEL + int wanted = atomic_read(&netstamp_wanted); - if (deferred) { - while (--deferred) - static_key_slow_dec(&netstamp_needed); - return; + while (wanted > 0) { + if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) + return; } + atomic_inc(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else + static_branch_inc(&netstamp_needed_key); #endif - static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { -#ifdef HAVE_JUMP_LABEL - if (in_interrupt()) { - atomic_inc(&netstamp_needed_deferred); - return; +#ifdef CONFIG_JUMP_LABEL + int wanted = atomic_read(&netstamp_wanted); + + while (wanted > 1) { + if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) + return; } + atomic_dec(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else + static_branch_dec(&netstamp_needed_key); #endif - static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { - skb->tstamp.tv64 = 0; - if (static_key_false(&netstamp_needed)) - __net_timestamp(skb); + skb->tstamp = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; + if (static_branch_unlikely(&netstamp_needed_key)) + skb->tstamp = ktime_get_real(); } -#define net_timestamp_check(COND, SKB) \ - if (static_key_false(&netstamp_needed)) { \ - if ((COND) && !(SKB)->tstamp.tv64) \ - __net_timestamp(SKB); \ - } \ +#define net_timestamp_check(COND, SKB) \ + if (static_branch_unlikely(&netstamp_needed_key)) { \ + if ((COND) && !(SKB)->tstamp) \ + (SKB)->tstamp = ktime_get_real(); \ + } \ -static inline bool is_skb_forwardable(struct net_device *dev, - struct sk_buff *skb) +bool is_skb_forwardable(const struct 
net_device *dev, const struct sk_buff *skb) { - unsigned int len; + return __is_skb_forwardable(dev, skb, true); +} +EXPORT_SYMBOL_GPL(is_skb_forwardable); - if (!(dev->flags & IFF_UP)) - return false; +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) +{ + int ret = ____dev_forward_skb(dev, skb, check_mtu); - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; + return ret; +} - return false; +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); } +EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif @@ -1678,41 +2454,43 @@ static inline bool is_skb_forwardable(struct net_device *dev, */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - skb_scrub_packet(skb); - skb->protocol = eth_type_trans(skb, dev); - - /* eth_type_trans() can set pkt_type. - * clear pkt_type _after_ calling eth_type_trans() - */ - skb->pkt_type = PACKET_HOST; - - return netif_rx(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + +static int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; - atomic_inc(&skb->users); + refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline void deliver_ptype_list_skb(struct sk_buff *skb, + struct packet_type **pt, + struct net_device *orig_dev, + __be16 type, + struct list_head *ptype_list) +{ + struct packet_type *ptype, *pt_prev = *pt; + + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->type != type) + continue; + if (unlikely(pt_prev)) + deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + *pt = pt_prev; +} + static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) @@ -1726,59 +2504,93 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) return false; } +/** + * dev_nit_active_rcu - return true if any network interface taps are in use + * + * The caller must hold the RCU lock + * + * @dev: network device to check for the presence of taps + */ +bool dev_nit_active_rcu(const struct net_device *dev) +{ + /* Callers may hold either RCU or RCU BH lock */ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + return !list_empty(&dev_net(dev)->ptype_all) || + !list_empty(&dev->ptype_all); +} +EXPORT_SYMBOL_GPL(dev_nit_active_rcu); + /* * 
Support routine. Sends outgoing frames to any network * taps currently in use. */ -static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct packet_type *ptype; + struct packet_type *ptype, *pt_prev = NULL; + struct list_head *ptype_list; struct sk_buff *skb2 = NULL; - struct packet_type *pt_prev = NULL; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { + ptype_list = &dev_net_rcu(dev)->ptype_all; +again: + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (READ_ONCE(ptype->ignore_outgoing)) + continue; + /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ - if ((ptype->dev == dev || !ptype->dev) && - (!skb_loop_sk(ptype, skb))) { - if (pt_prev) { - deliver_skb(skb2, pt_prev, skb->dev); - pt_prev = ptype; - continue; - } + if (skb_loop_sk(ptype, skb)) + continue; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; + if (unlikely(pt_prev)) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } - net_timestamp_set(skb2); + /* need to clone skb, done only once */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out_unlock; - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb_reset_mac_header(skb2); - - if (skb_network_header(skb2) < skb2->data || - skb_network_header(skb2) > skb_tail_pointer(skb2)) { - net_crit_ratelimited("protocol %04x is buggy, dev %s\n", - ntohs(skb2->protocol), - dev->name); - skb_reset_network_header(skb2); - } + net_timestamp_set(skb2); - skb2->transport_header = skb2->network_header; - skb2->pkt_type = PACKET_OUTGOING; - pt_prev = ptype; + /* skb->nh should be correctly + * set by sender, so that the second statement is + * just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + + if (ptype_list != &dev->ptype_all) { + ptype_list = &dev->ptype_all; + goto again; + } +out_unlock: + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); } - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change @@ -1800,7 +2612,7 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); + netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } @@ -1811,79 +2623,152 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", - i, q); + netdev_warn(dev, "Number of in use tx queues changed. 
Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", + i, q); netdev_set_prio_tc_map(dev, i, 0); } } } +int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) +{ + if (dev->num_tc) { + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + int i; + + /* walk through the TCs and see if it falls into any of them */ + for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { + if ((txq - tc->offset) < tc->count) + return i; + } + + /* didn't find it, just return -1 to indicate no match */ + return -1; + } + + return 0; +} +EXPORT_SYMBOL(netdev_txq_to_tc); + #ifdef CONFIG_XPS +static struct static_key xps_needed __read_mostly; +static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) -static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, - int cpu, u16 index) +static bool remove_xps_queue(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->attr_map[tci]); + if (!map) + return false; - for (pos = 0; map && pos < map->len; pos++) { - if (map->queues[pos] == index) { - if (map->len > 1) { - map->queues[pos] = map->queues[--map->len]; - } else { - RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); - kfree_rcu(map, rcu); - map = NULL; - } + for (pos = map->len; pos--;) { + if (map->queues[pos] != index) + continue; + + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; break; } + + if (old_maps) + RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); + return false; } - return map; + return true; } -static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +static bool remove_xps_queue_cpu(struct net_device *dev, + struct xps_dev_maps *dev_maps, + int cpu, u16 offset, u16 count) { - struct xps_dev_maps *dev_maps; - int cpu, i; + int num_tc = dev_maps->num_tc; bool active = false; + int tci; - mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - - if (!dev_maps) - goto out_no_maps; + for (tci = cpu * num_tc; num_tc--; tci++) { + int i, j; - for_each_possible_cpu(cpu) { - for (i = index; i < dev->num_tx_queues; i++) { - if (!remove_xps_queue(dev_maps, cpu, i)) + for (i = count, j = offset; i--; j++) { + if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } - if (i == dev->num_tx_queues) - active = true; + + active |= i < 0; } - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); + return active; +} + +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + enum xps_map_type type) +{ + static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + + kfree_rcu(dev_maps, rcu); +} + +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) +{ + struct xps_dev_maps *dev_maps; + bool active = false; + int i, j; + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + + for (j = 0; j < dev_maps->nr_ids; j++) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); + if (!active) + reset_xps_maps(dev, dev_maps, type); + + if (type == XPS_CPUS) { + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + 
netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } +} + +static void netif_reset_xps_queues(struct net_device *dev, u16 offset, + u16 count) +{ + if (!static_key_false(&xps_needed)) + return; + + cpus_read_lock(); + mutex_lock(&xps_map_mutex); - for (i = index; i < dev->num_tx_queues; i++) - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); + + clean_xps_maps(dev, XPS_CPUS, offset, count); -out_no_maps: mutex_unlock(&xps_map_mutex); + cpus_read_unlock(); } -static struct xps_map *expand_xps_map(struct xps_map *map, - int cpu, u16 index) +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; @@ -1895,7 +2780,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return map; } - /* Need to add queue to this CPU's existing map */ + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; @@ -1903,9 +2788,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map, alloc_len = map->alloc_len * 2; } - /* Need to allocate new map to store queue on this CPU's map */ - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, - cpu_to_node(cpu)); + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); if (!new_map) return NULL; @@ -1917,108 +2807,205 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) +/* Copy xps maps at a given index */ +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *new_dev_maps, int index, + int tc, bool skip_tc) { - struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + int i, tci = index * dev_maps->num_tc; + struct xps_map *map; + + /* copy maps belonging to foreign traffic classes */ + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && skip_tc) + continue; + + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } +} + +/* Must be called under cpus_read_lock */ +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, enum xps_map_type type) +{ + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; + const unsigned long *online_mask = NULL; + bool active = false, copy = false; + int i, j, tci, numa_node_id = -2; + int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); - int cpu, numa_node_id = -2; - bool active = false; + unsigned int nr_ids; + + WARN_ON_ONCE(index >= dev->num_tx_queues); + + if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ + num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) + online_mask = cpumask_bits(cpu_online_mask); + nr_ids = nr_cpu_ids; + } - /* allocate memory for queue storage */ - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mask)) - continue; + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; - if (!new_dev_maps) - new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. + */ + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) + copy = true; + + /* allocate memory for queue storage */ + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { if (!new_dev_maps) { - mutex_unlock(&xps_map_mutex); - return -ENOMEM; + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->nr_ids = nr_ids; + new_dev_maps->num_tc = num_tc; } - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : - NULL; + tci = j * num_tc + tc; + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, cpu, index); + map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { - /* add queue to CPU maps */ + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } + + for (j = 0; j < nr_ids; j++) { + bool skip_tc = false; + + tci = j * num_tc + tc; + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + skip_tc = true; + + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; + if (type == XPS_CPUS) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } #endif - } else if (dev_maps) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[cpu]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); } + if (copy) + xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, + skip_tc); } - rcu_assign_pointer(dev->xps_maps, new_dev_maps); + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ - if (dev_maps) { - for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map 
= xmap_dereference(dev_maps->cpu_map[cpu]); - if (map && map != new_map) - kfree_rcu(map, rcu); - } + if (!dev_maps) + goto out_no_old_maps; + + for (j = 0; j < dev_maps->nr_ids; j++) { + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { + map = xmap_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; - kfree_rcu(dev_maps, rcu); + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); + } } + old_dev_maps = dev_maps; + +out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: - /* update Tx queue numa node */ - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), - (numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); + if (type == XPS_CPUS) + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; - /* removes queue from unused CPUs */ - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) - continue; + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = 0; j < dev_maps->nr_ids; j++) { + tci = j * dev_maps->num_tc; + + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && + netif_attr_test_mask(j, mask, dev_maps->nr_ids) && + netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) + continue; - if (remove_xps_queue(dev_maps, cpu, index)) - active = true; + active |= remove_xps_queue(dev_maps, + copy ? old_dev_maps : NULL, + tci, index); + } } + if (old_dev_maps) + kfree_rcu(old_dev_maps, rcu); + /* free map if not active */ - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2026,12 +3013,15 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : - NULL; - if (new_map && new_map != map) - kfree(new_map); + for (j = 0; j < nr_ids; j++) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = copy ? 
+ xmap_dereference(dev_maps->attr_map[tci]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } } mutex_unlock(&xps_map_mutex); @@ -2039,23 +3029,157 @@ error: kfree(new_dev_maps); return -ENOMEM; } +EXPORT_SYMBOL_GPL(__netif_set_xps_queue); + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + int ret; + + cpus_read_lock(); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); + cpus_read_unlock(); + + return ret; +} EXPORT_SYMBOL(netif_set_xps_queue); #endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + +void netdev_reset_tc(struct net_device *dev) +{ +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} +EXPORT_SYMBOL(netdev_reset_tc); + +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues(dev, offset, count); +#endif + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} +EXPORT_SYMBOL(netdev_set_tc_queue); + +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + netdev_unbind_all_sb_channels(dev); + + dev->num_tc = num_tc; + return 0; +} +EXPORT_SYMBOL(netdev_set_num_tc); + +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. 
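+	 *
+	 * Worked example (numbers are illustrative):
+	 * netdev_set_sb_channel(dev, 5) stores num_tc = -5, while
+	 * netdev_set_sb_channel(dev, 0) restores num_tc = 0, i.e. normal mode.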
+ */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues - * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { + bool disabling; int rc; + disabling = txq < dev->real_num_tx_queues; + if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); @@ -2065,20 +3189,27 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); - if (txq < dev->real_num_tx_queues) { + net_shaper_set_real_num_tx_queues(dev, txq); + + dev_qdisc_change_real_num_tx(dev, txq); + + dev->real_num_tx_queues = txq; + + if (disabling) { + synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } + } else { + dev->real_num_tx_queues = txq; } - dev->real_num_tx_queues = txq; return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -2097,7 +3228,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); @@ -2109,27 +3240,131 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); -#endif + +/** + * netif_set_real_num_queues - set actual number of RX and TX queues used + * @dev: Network device + * @txq: Actual number of TX queues + * @rxq: Actual number of RX queues + * + * Set the real number of both TX and RX queues. + * Does nothing if the number of queues is already correct. + */ +int netif_set_real_num_queues(struct net_device *dev, + unsigned int txq, unsigned int rxq) +{ + unsigned int old_rxq = dev->real_num_rx_queues; + int err; + + if (txq < 1 || txq > dev->num_tx_queues || + rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + /* Start from increases, so the error path only does decreases - + * decreases can't fail. + */ + if (rxq > dev->real_num_rx_queues) { + err = netif_set_real_num_rx_queues(dev, rxq); + if (err) + return err; + } + if (txq > dev->real_num_tx_queues) { + err = netif_set_real_num_tx_queues(dev, txq); + if (err) + goto undo_rx; + } + if (rxq < dev->real_num_rx_queues) + WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); + if (txq < dev->real_num_tx_queues) + WARN_ON(netif_set_real_num_tx_queues(dev, txq)); + + return 0; +undo_rx: + WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); + return err; +} +EXPORT_SYMBOL(netif_set_real_num_queues); + +/** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. 
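+ *
+ *	For instance (illustrative only), a driver whose hardware cannot
+ *	handle TSO super-frames larger than 16 KB could call, from its
+ *	setup path:
+ *
+ *		netif_set_tso_max_size(netdev, 16 * 1024);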
+ */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = min(GSO_MAX_SIZE, size); + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); + if (size < READ_ONCE(dev->gso_ipv4_max_size)) + netif_set_gso_ipv4_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * - * This routine should set an upper limit on the number of RSS queues - * used by default by multiqueue devices. + * Default value is the number of physical cores if there are only 1 or 2, or + * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { - return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); + cpumask_var_t cpus; + int cpu, count = 0; + + if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) + return 1; + + cpumask_copy(cpus, cpu_online_mask); + for_each_cpu(cpu, cpus) { + ++count; + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + } + free_cpumask_var(cpus); + + return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); -static inline void __netif_reschedule(struct Qdisc *q) +static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; @@ -2139,35 +3374,82 @@ static inline void __netif_reschedule(struct Qdisc *q) void __netif_schedule(struct Qdisc *q) { + /* If q->defer_list is not empty, at least one thread is + * in __dev_xmit_skb() before llist_del_all(&q->defer_list). + * This thread will attempt to run the queue. 
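+	 *
+	 * (In other words, that thread will service the queue itself, so
+	 *  raising an extra NET_TX_SOFTIRQ from here would be redundant.)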
+ */ + if (!llist_empty(&q->defer_list)) + return; + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); -void dev_kfree_skb_irq(struct sk_buff *skb) +struct dev_kfree_skb_cb { + enum skb_drop_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { - if (atomic_dec_and_test(&skb->users)) { - struct softnet_data *sd; - unsigned long flags; + return (struct dev_kfree_skb_cb *)skb->cb; +} - local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); - skb->next = sd->completion_queue; - sd->completion_queue = skb; - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!netif_xmit_stopped(txq)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); } + rcu_read_unlock(); } -EXPORT_SYMBOL(dev_kfree_skb_irq); +EXPORT_SYMBOL(netif_schedule_queue); -void dev_kfree_skb_any(struct sk_buff *skb) +void netif_tx_wake_queue(struct netdev_queue *dev_queue) { - if (in_irq() || irqs_disabled()) - dev_kfree_skb_irq(skb); + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + +void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + unsigned long flags; + + if (unlikely(!skb)) + return; + + if (likely(refcount_read(&skb->users) == 1)) { + smp_rmb(); + refcount_set(&skb->users, 0); + } else if (likely(!refcount_dec_and_test(&skb->users))) { + return; + } + get_kfree_skb_cb(skb)->reason = reason; + local_irq_save(flags); + skb->next = __this_cpu_read(softnet_data.completion_queue); + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(dev_kfree_skb_irq_reason); + +void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + if (in_hardirq() || irqs_disabled()) + dev_kfree_skb_irq_reason(skb, reason); else - dev_kfree_skb(skb); + kfree_skb_reason(skb, reason); } -EXPORT_SYMBOL(dev_kfree_skb_any); +EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** @@ -2196,29 +3478,68 @@ void netif_device_attach(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); -static void skb_warn_bad_offload(const struct sk_buff *skb) +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. 
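+ *
+ * Worked example (illustrative numbers): with qoffset == 8 and qcount == 4,
+ * a recorded rx queue of 13 maps to tx queue 8 + ((13 - 8) % 4) == 9.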
+ */ +static u16 skb_tx_hash(const struct net_device *dev, + const struct net_device *sb_dev, + struct sk_buff *skb) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = dev->real_num_tx_queues; + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + + qoffset = sb_dev->tc_to_txq[tc].offset; + qcount = sb_dev->tc_to_txq[tc].count; + if (unlikely(!qcount)) { + net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", + sb_dev->name, qoffset, tc); + qoffset = 0; + qcount = dev->real_num_tx_queues; + } + } + + if (skb_rx_queue_recorded(skb)) { + DEBUG_NET_WARN_ON_ONCE(qcount == 0); + hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; + while (unlikely(hash >= qcount)) + hash -= qcount; + return hash + qoffset; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} + +void skb_warn_bad_offload(const struct sk_buff *skb) { - static const netdev_features_t null_features = 0; + static const netdev_features_t null_features; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - - WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " - "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, - skb->sk ? &skb->sk->sk_route_caps : &null_features, - skb->len, skb->data_len, skb_shinfo(skb)->gso_size, - skb_shinfo(skb)->gso_type, skb->ip_summed); + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } + skb_dump(KERN_WARNING, skb, false); + WARN(1, "%s: caps=(%pNF, %pNF)\n", + name, dev ? &dev->features : &null_features, + skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* @@ -2233,11 +3554,15 @@ int skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } + if (!skb_frags_readable(skb)) { + return -EFAULT; + } + /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
*/ @@ -2248,20 +3573,27 @@ int skb_checksum_help(struct sk_buff *skb) } offset = skb_checksum_start_offset(skb); - BUG_ON(offset >= skb_headlen(skb)); + ret = -EINVAL; + if (unlikely(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", + offset, skb_headlen(skb)); + goto out; + } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; + if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", + offset + sizeof(__sum16), skb_headlen(skb)); + goto out; } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: @@ -2269,159 +3601,93 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +#ifdef CONFIG_NET_CRC32C +int skb_crc32c_csum_help(struct sk_buff *skb) { - __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; + u32 crc; + int ret = 0, offset, start; - /* Tunnel gso handlers can set protocol to ethernet. */ - if (type == htons(ETH_P_TEB)) { - struct ethhdr *eth; + if (skb->ip_summed != CHECKSUM_PARTIAL) + goto out; - if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) - return 0; + if (unlikely(skb_is_gso(skb))) + goto out; - eth = (struct ethhdr *)skb_mac_header(skb); - type = eth->h_proto; + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (unlikely(skb_has_shared_frag(skb))) { + ret = __skb_linearize(skb); + if (ret) + goto out; } - - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; + start = skb_checksum_start_offset(skb); + offset = start + offsetof(struct sctphdr, checksum); + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + ret = -EINVAL; + goto out; } - return type; + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + + crc = ~skb_crc32c(skb, start, skb->len - start, ~0); + *(__le32 *)(skb->data + offset) = cpu_to_le32(crc); + skb_reset_csum_not_inet(skb); +out: + return ret; } +EXPORT_SYMBOL(skb_crc32c_csum_help); +#endif /* CONFIG_NET_CRC32C */ -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - __be16 type = skb_network_protocol(skb); + __be16 type = skb->protocol; - if (unlikely(!type)) - return ERR_PTR(-EINVAL); + /* Tunnel gso handlers can set protocol to ethernet. 
*/ + if (type == htons(ETH_P_TEB)) { + struct ethhdr *eth; - __skb_pull(skb, skb->mac_len); + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + return 0; - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } - segs = ptype->callbacks.gso_segment(skb, features); - break; - } + eth = (struct ethhdr *)skb->data; + type = eth->h_proto; } - rcu_read_unlock(); - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; + return vlan_get_protocol_and_depth(skb, type, depth); } -EXPORT_SYMBOL(skb_mac_gso_segment); -/* openvswitch calls this on rx path, so we need a different check. - */ -static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; - else - return skb->ip_summed == CHECKSUM_NONE; + netdev_err(dev, "hw csum failure\n"); + skb_dump(KERN_ERR, skb, true); + dump_stack(); } -/** - * __skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @tx_path: whether it is called in TX path - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - */ -struct sk_buff *__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, bool tx_path) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - if (unlikely(skb_needs_check(skb, tx_path))) { - int err; - - skb_warn_bad_offload(skb); - - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) - return ERR_PTR(err); - } - - SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - return skb_mac_gso_segment(skb, features); -} -EXPORT_SYMBOL(__skb_gso_segment); - -/* Take action when hardware reception checksum errors are detected. */ -#ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) -{ - if (net_ratelimit()) { - pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); - dump_stack(); - } + DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif -/* Actually, we should eliminate this check as soon as we know, that: - * 1. IOMMU is present and allows to map all the memory. - * 2. No high memory really exists on this machine. - */ - +/* XXX: check that highmem exists at all on the given machine. 
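netdev_rx_csum_fault() above now uses DO_ONCE_LITE() so the skb dump fires a single time rather than once per rate-limit window. A rough userspace approximation of a run-once wrapper, built on a C11 atomic flag instead of the kernel macro (names invented):

#include <stdatomic.h>
#include <stdio.h>

#define do_once_lite(fn, ...)						\
	do {								\
		static atomic_flag __done = ATOMIC_FLAG_INIT;		\
		if (!atomic_flag_test_and_set(&__done))			\
			fn(__VA_ARGS__);				\
	} while (0)

static void report_csum_fault(const char *dev)
{
	fprintf(stderr, "%s: hw csum failure\n", dev);
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		do_once_lite(report_csum_fault, "eth0");	/* prints once */
	return 0;
}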
*/ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - if (PageHighMem(skb_frag_page(frag))) - return 1; - } - } - - if (PCI_DMA_BUS_IS_PHYS) { - struct device *pdev = dev->dev.parent; + struct page *page = skb_frag_page(frag); - if (!pdev) - return 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - dma_addr_t addr = page_to_phys(skb_frag_page(frag)); - if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) + if (page && PageHighMem(page)) return 1; } } @@ -2429,385 +3695,1046 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. + */ +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + if (eth_p_mpls(type)) + features &= skb->dev->mpls_features; -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} +#endif -static void dev_gso_skb_destructor(struct sk_buff *skb) +static netdev_features_t harmonize_features(struct sk_buff *skb, + netdev_features_t features) { - struct dev_gso_cb *cb; + __be16 type; - do { - struct sk_buff *nskb = skb->next; + type = skb_network_protocol(skb, NULL); + features = net_mpls_features(skb, features, type); - skb->next = nskb->next; - nskb->next = NULL; - kfree_skb(nskb); - } while (skb->next); + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, type)) { + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + } + if (illegal_highdma(skb->dev, skb)) + features &= ~NETIF_F_SG; - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); + return features; } -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) +netdev_features_t passthru_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) { - struct sk_buff *segs; + return features; +} +EXPORT_SYMBOL(passthru_features_check); - segs = skb_gso_segment(skb, features); +static netdev_features_t dflt_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return vlan_features_check(skb, features); +} - /* Verifying header integrity only. 
*/ - if (!segs) - return 0; +static netdev_features_t gso_features_check(const struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (IS_ERR(segs)) - return PTR_ERR(segs); + if (gso_segs > READ_ONCE(dev->gso_max_segs)) + return features & ~NETIF_F_GSO_MASK; - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; + if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) + return features & ~NETIF_F_GSO_MASK; - return 0; -} + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } -static netdev_features_t harmonize_features(struct sk_buff *skb, - netdev_features_t features) -{ - if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, skb_network_protocol(skb))) { - features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(skb->dev, skb)) { - features &= ~NETIF_F_SG; + /* Support for GSO partial features requires software + * intervention before we can actually process the packets + * so we need to strip support for any partial features now + * and we can pull them back in after we have partially + * segmented the frame. + */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) + features &= ~dev->gso_partial_features; + + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + struct iphdr *iph = skb->encapsulation ? + inner_ip_hdr(skb) : ip_hdr(skb); + + if (!(iph->frag_off & htons(IP_DF))) + features &= ~NETIF_F_TSO_MANGLEID; } + /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, + * so neither does TSO that depends on it. 
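gso_features_check() above keeps NETIF_F_TSO_MANGLEID only when the IPv4 header cannot be fragmented, i.e. when IP_DF is set. A standalone check of that bit over a raw header, assuming the RFC 791 layout; the helper name is made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool ipv4_df_set(const uint8_t *iph)
{
	/* flags + fragment offset are bytes 6-7, network byte order */
	uint16_t frag_off = (uint16_t)(iph[6] << 8 | iph[7]);

	return frag_off & 0x4000;	/* IP_DF */
}

int main(void)
{
	uint8_t hdr[20] = { 0x45, 0, 0, 40, 0, 0, 0x40, 0 };	/* DF set */

	printf("may keep TSO_MANGLEID: %d\n", ipv4_df_set(hdr));
	return 0;
}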
+ */ + if (features & NETIF_F_IPV6_CSUM && + (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); + return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { - __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) - features &= ~NETIF_F_GSO_MASK; - - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); - } + struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + if (skb_is_gso(skb)) + features = gso_features_check(skb, dev, features); - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) - features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + + if (skb_vlan_tagged(skb)) + features = netdev_intersect_features(features, + dev->vlan_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + else + features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); -/* - * Returns true if either: - * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG. 
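The comment being removed here describes the linearization rule that validate_xmit_skb() still applies further down: linearize when the skb carries a frag list without NETIF_F_FRAGLIST, or page frags without NETIF_F_SG. A self-contained sketch of that predicate with made-up flag and struct names:

#include <stdbool.h>
#include <stdio.h>

#define F_SG		(1u << 0)
#define F_FRAGLIST	(1u << 1)

struct fake_skb {
	bool has_frag_list;
	unsigned int nr_frags;
};

static bool needs_linearize(const struct fake_skb *skb, unsigned int features)
{
	return (skb->has_frag_list && !(features & F_FRAGLIST)) ||
	       (skb->nr_frags && !(features & F_SG));
}

int main(void)
{
	struct fake_skb skb = { .has_frag_list = false, .nr_frags = 3 };

	printf("linearize: %d\n", needs_linearize(&skb, F_FRAGLIST));	/* 1: no SG */
	printf("linearize: %d\n", needs_linearize(&skb, F_SG));		/* 0 */
	return 0;
}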
- */ -static inline int skb_needs_linearize(struct sk_buff *skb, - netdev_features_t features) +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, bool more) { - return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && - !(features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && - !(features & NETIF_F_SG))); + unsigned int len; + int rc; + + if (dev_nit_active_rcu(dev)) + dev_queue_xmit_nit(skb, dev); + + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq, more); + trace_net_dev_xmit(skb, rc, dev, len); + + return rc; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) { - const struct net_device_ops *ops = dev->netdev_ops; + struct sk_buff *skb = first; int rc = NETDEV_TX_OK; - unsigned int skb_len; - if (likely(!skb->next)) { - netdev_features_t features; + while (skb) { + struct sk_buff *next = skb->next; - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); + skb_mark_not_on_list(skb); + rc = xmit_one(skb, dev, txq, next != NULL); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } - features = netif_skb_features(skb); + skb = next; + if (netif_tx_queue_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; + } + } - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; +out: + *ret = rc; + return skb; +} - skb->vlan_tci = 0; - } +static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) +{ + if (skb_vlan_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); + return skb; +} - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features) +{ + if (unlikely(skb_csum_is_sctp(skb))) + return !!(features & NETIF_F_SCTP_CRC) ? 0 : + skb_crc32c_csum_help(skb); - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) - goto out_kfree_skb; + if (features & NETIF_F_HW_CSUM) + return 0; - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. 
- */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } + if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + goto sw_checksum; + + switch (skb->csum_offset) { + case offsetof(struct tcphdr, check): + case offsetof(struct udphdr, check): + return 0; } + } - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); +sw_checksum: + return skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); - skb_len = skb->len; - rc = ops->ndo_start_xmit(skb, dev); - trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); - return rc; +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. + */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + struct sock *sk = skb->sk; + + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; } +#endif -gso: - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); - - skb_len = nskb->len; - rc = ops->ndo_start_xmit(nskb, dev); - trace_net_dev_xmit(nskb, rc, dev, skb_len); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; + return skb; +} + +static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *shinfo; + struct net_iov *niov; + + if (likely(skb_frags_readable(skb))) + goto out; + + if (!dev->netmem_tx) + goto out_free; + + shinfo = skb_shinfo(skb); + + if (shinfo->nr_frags > 0) { + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (net_is_devmem_iov(niov) && + net_devmem_iov_binding(niov)->dev != dev) + goto out_free; + } + +out: + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) +{ + netdev_features_t features; + + skb = validate_xmit_unreadable_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; + + skb = sk_validate_xmit_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + + if (netif_needs_gso(skb, features)) { + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs)) { + goto out_kfree_skb; + } else if (segs) { + consume_skb(skb); + skb = segs; } - txq_trans_update(txq); - if 
(unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; -out_kfree_gso_skb: - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (skb_csum_hwoffload_help(skb, features)) + goto out_kfree_skb; + } } + + skb = validate_xmit_xfrm(skb, features, again); + + return skb; + out_kfree_skb: kfree_skb(skb); -out: - return rc; +out_null: + dev_core_stats_tx_dropped_inc(dev); + return NULL; +} + +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) +{ + struct sk_buff *next, *head = NULL, *tail; + + for (; skb != NULL; skb = next) { + next = skb->next; + skb_mark_not_on_list(skb); + + /* in case skb won't be segmented, point to itself */ + skb->prev = skb; + + skb = validate_xmit_skb(skb, dev, again); + if (!skb) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. + */ + tail = skb->prev; + } + return head; } +EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff *skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size) { + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; - u16 gso_segs = shinfo->gso_segs; /* mac layer + network layer */ - hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, hdr_len, + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, hdr_len, + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } - if (shinfo->gso_type & SKB_GSO_DODGY) - gso_segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { + int payload = skb->len - hdr_len; + /* Malicious packet. 
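For untrusted (SKB_GSO_DODGY) packets, qdisc_pkt_len_segs_init() above recomputes the segment count from gso_size and then charges the header once per segment in the byte estimate. The arithmetic, as a standalone sketch with invented sample sizes:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int skb_len = 4000, hdr_len = 66, gso_size = 1448;
	unsigned int payload = skb_len - hdr_len;
	unsigned int gso_segs = DIV_ROUND_UP(payload, gso_size);
	unsigned int pkt_len = skb_len + (gso_segs - 1) * hdr_len;

	printf("segs=%u, estimated wire bytes=%u\n", gso_segs, pkt_len);
	return 0;
}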
*/ + if (payload <= 0) + return; + gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; + } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } +static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, + struct sk_buff **to_free, + struct netdev_queue *txq) +{ + int rc; + + rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; + if (rc == NET_XMIT_SUCCESS) + trace_qdisc_enqueue(q, txq, skb); + return rc; +} + static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); - bool contended; + struct llist_node *ll_list, *first_n; + unsigned long defer_count = 0; int rc; - qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); - /* - * Heuristic to force contended enqueues to serialize on a - * separate lock before trying to get qdisc main lock. - * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); + + if (q->flags & TCQ_F_NOLOCK) { + if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && + qdisc_run_begin(q)) { + /* Retest nolock_qdisc_is_empty() within the protection + * of q->seqlock to protect from racing with requeuing. + */ + if (unlikely(!nolock_qdisc_is_empty(q))) { + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + __qdisc_run(q); + to_free2 = qdisc_run_end(q); + + goto free_skbs; + } + + qdisc_bstats_cpu_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && + !nolock_qdisc_is_empty(q)) + __qdisc_run(q); + + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; + } + + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + to_free2 = qdisc_run(q); + goto free_skbs; + } + + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. + * In the try_cmpxchg() loop, we want to increment q->defer_count + * at most once to limit the number of skbs in defer_list. + * We perform the defer_count increment only if the list is not empty, + * because some arches have slow atomic_long_inc_return(). */ - contended = qdisc_is_running(q); - if (unlikely(contended)) - spin_lock(&q->busylock); + first_n = READ_ONCE(q->defer_list.first); + do { + if (first_n && !defer_count) { + defer_count = atomic_long_inc_return(&q->defer_count); + if (unlikely(defer_count > READ_ONCE(q->limit))) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + return NET_XMIT_DROP; + } + } + skb->ll_node.next = first_n; + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); + + /* If defer_list was not empty, we know the cpu which queued + * the first skb will process the whole list for us. + */ + if (first_n) + return NET_XMIT_SUCCESS; spin_lock(root_lock); + + ll_list = llist_del_all(&q->defer_list); + /* There is a small race because we clear defer_count not atomically + * with the prior llist_del_all(). This means defer_list could grow + * over q->limit. 
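The enqueue path above open-codes llist_add() as a try_cmpxchg() loop so each producer can tell whether it was first onto the defer list and therefore owns the flush. A userspace sketch of the same push loop using C11 atomics; the node type and return convention are illustrative, not the kernel llist API.

#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

static _Atomic(struct node *) list_head;

static int push(struct node *n)
{
	struct node *first = atomic_load(&list_head);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&list_head, &first, n));

	/* non-NULL 'first' means another producer already owns the flush */
	return first != NULL;
}

int main(void)
{
	struct node a = { .val = 1 }, b = { .val = 2 };

	printf("a: someone else flushes? %d\n", push(&a));	/* 0 */
	printf("b: someone else flushes? %d\n", push(&b));	/* 1 */
	return 0;
}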
+ */ + atomic_long_set(&q->defer_count, 0); + + ll_list = llist_reverse_order(ll_list); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - kfree_skb(skb); + llist_for_each_entry_safe(skb, next, ll_list, ll_node) + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - qdisc_run_begin(q)) { + goto unlock; + } + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, + struct sk_buff, + ll_node)); qdisc_bstats_update(q, skb); - - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - } else - qdisc_run_end(q); - + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); - rc = q->enqueue(skb, q) & NET_XMIT_MASK; - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } - __qdisc_run(q); + int count = 0; + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + prefetch(next); + prefetch(&next->priority); + skb_mark_not_on_list(skb); + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + count++; } + to_free2 = qdisc_run(q); + if (count != 1) + rc = NET_XMIT_SUCCESS; } +unlock: spin_unlock(root_lock); - if (unlikely(contended)) - spin_unlock(&q->busylock); + +free_skbs: + tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { - struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + const struct netprio_map *map; + const struct sock *sk; + unsigned int prioidx; - if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + if (skb->priority) + return; + map = rcu_dereference_bh(skb->dev->priomap); + if (!map) + return; + sk = skb_to_full_sk(skb); + if (!sk) + return; - if (prioidx < map->priomap_len) - skb->priority = map->priomap[prioidx]; - } + prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); + + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); -#define RECURSION_LIMIT 10 - /** * dev_loopback_xmit - loop back @skb + * @net: network namespace this loopback is happening in + * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ -int dev_loopback_xmit(struct sk_buff *skb) +int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; - skb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(skb)); + if (skb->ip_summed == CHECKSUM_NONE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); - netif_rx_ni(skb); + netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); +#ifdef CONFIG_NET_EGRESS +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, 
qm)); +} + +#ifndef CONFIG_PREEMPT_RT +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); + +#else +static bool netdev_xmit_txqueue_skipped(void) +{ + return current->net_xmit.skip_txqueue; +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + current->net_xmit.skip_txqueue = skip; +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, + enum skb_drop_reason *drop_reason) +{ + int ret = TC_ACT_UNSPEC; +#ifdef CONFIG_NET_CLS_ACT + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; + + if (!miniq) + return ret; + + /* Global bypass */ + if (!static_branch_likely(&tcf_sw_enabled_key)) + return ret; + + /* Block-wise bypass */ + if (tcf_block_bypass_sw(miniq->block)) + return ret; + + tc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; + tcf_set_drop_reason(skb, *drop_reason); + + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. */ + switch (ret) { + case TC_ACT_SHOT: + *drop_reason = tcf_get_drop_reason(skb); + mini_qdisc_qstats_cpu_drop(miniq); + break; + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(res.classid); + break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int sch_ret; + + if (!entry) + return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + if (unlikely(*pt_prev)) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_pkt_len_segs_init(skb); + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. 
+ */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + case TC_ACT_SHOT: + kfree_skb_reason(skb, drop_reason); + *ret = NET_RX_DROP; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + } + bpf_net_ctx_clear(bpf_net_ctx); + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int sch_ret; + + if (!entry) + return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. + */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); +egress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! */ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + case TC_ACT_SHOT: + kfree_skb_reason(skb, drop_reason); + *ret = NET_XMIT_DROP; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + } + bpf_net_ctx_clear(bpf_net_ctx); + + return skb; +} +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + return skb; +} +#endif /* CONFIG_NET_XGRESS */ + +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + int tc = netdev_get_prio_tc_map(dev, skb->priority); + struct xps_map *map; + int queue_index = -1; + + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, + struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct sock *sk = skb->sk; + int queue_index = -1; + + if (!static_key_false(&xps_needed)) + return -1; + + rcu_read_lock(); + if (!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); + if (dev_maps) { + int tci = sk_rx_queue_get(sk); + + if (tci >= 0) + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + 
tci); + } + +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + return 0; +} +EXPORT_SYMBOL(dev_pick_tx_zero); + +int sk_tx_queue_get(const struct sock *sk) +{ + int resel, val; + + if (!sk) + return -1; + /* Paired with WRITE_ONCE() in sk_tx_queue_clear() + * and sk_tx_queue_set(). + */ + val = READ_ONCE(sk->sk_tx_queue_mapping); + + if (val == NO_QUEUE_MAPPING) + return -1; + + if (!sk_fullsock(sk)) + return val; + + resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); + if (resel && time_is_before_jiffies( + READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) + return -1; + + return val; +} +EXPORT_SYMBOL(sk_tx_queue_get); + +u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + sb_dev = sb_dev ? : dev; + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, sb_dev, skb); + + if (new_index < 0) + new_index = skb_tx_hash(dev, sb_dev, skb); + + if (sk && sk_fullsock(sk) && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} +EXPORT_SYMBOL(netdev_pick_tx); + +struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, + struct sk_buff *skb, + struct net_device *sb_dev) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, sb_dev); + else + queue_index = netdev_pick_tx(dev, skb, sb_dev); + + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + /** - * dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit + * __dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: suboordinate device used for L2 forwarding offload * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. 
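netdev_pick_tx() above resolves the transmit queue in a fixed fallback order: a queue cached on the socket, then an XPS lookup, then skb_tx_hash(). A compact sketch of that policy with stand-in lookup helpers (all names and return values invented):

#include <stdio.h>

static int cached_queue(void)	{ return -1; }	/* pretend: nothing cached on the socket */
static int xps_queue(void)	{ return -1; }	/* pretend: no XPS mapping */
static int hash_queue(void)	{ return 5;  }	/* pretend skb_tx_hash() result */

static int pick_tx_queue(int real_num_tx_queues)
{
	int q = cached_queue();

	if (q < 0 || q >= real_num_tx_queues) {
		q = xps_queue();
		if (q < 0)
			q = hash_queue();
	}
	return q;
}

int main(void)
{
	printf("selected txq: %d\n", pick_tx_queue(8));
	return 0;
}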
+ * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) - * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. + * * negative errno - other errors */ -int dev_queue_xmit(struct sk_buff *skb) +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; - struct netdev_queue *txq; + struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; + bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); + + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. @@ -2816,12 +4743,41 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = netdev_pick_tx(dev, skb); - q = rcu_dereference_bh(txq->qdisc); + qdisc_pkt_len_segs_init(skb); + tcx_set_ingress(skb, false); +#ifdef CONFIG_NET_EGRESS + if (static_branch_unlikely(&egress_needed_key)) { + if (nf_hook_egress_active()) { + skb = nf_hook_egress(skb, &rc, dev); + if (!skb) + goto out; + } -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); + netdev_xmit_skip_txqueue(false); + + nf_skip_egress(skb, true); + skb = sch_handle_egress(skb, &rc, dev); + if (!skb) + goto out; + nf_skip_egress(skb, false); + + if (netdev_xmit_txqueue_skipped()) + txq = netdev_tx_queue_mapping(dev, skb); + } #endif + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + + if (!txq) + txq = netdev_core_pick_tx(dev, skb, sb_dev); + + q = rcu_dereference_bh(txq->qdisc); + trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); @@ -2829,31 +4785,37 @@ int dev_queue_xmit(struct sk_buff *skb) } /* The device has no queue. Common case for software devices: - loopback, all the sorts of tunnels... + * loopback, all the sorts of tunnels... - Really, it is unlikely that netif_tx_lock protection is necessary - here. (f.e. loopback and IP tunnels are clean ignoring statistics - counters.) - However, it is possible, that they rely on protection - made by us here. + * Really, it is unlikely that netif_tx_lock protection is necessary + * here. (f.e. loopback and IP tunnels are clean ignoring statistics + * counters.) + * However, it is possible, that they rely on protection + * made by us here. - Check this and shot the lock. It is not prone from deadlocks. - Either shot noqueue qdisc, it is even simpler 8) + * Check this and shot the lock. It is not prone from deadlocks. 
+ *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (txq->xmit_lock_owner != cpu) { - - if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + if (READ_ONCE(txq->xmit_lock_owner) != cpu) { + if (dev_xmit_recursion()) goto recursion_alert; + skb = validate_xmit_skb(skb, dev, &again); + if (!skb) + goto out; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { - __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); - __this_cpu_dec(xmit_recursion); + dev_xmit_recursion_inc(); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); + dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; @@ -2875,52 +4837,153 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); - kfree_skb(skb); + dev_core_stats_tx_dropped_inc(dev); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } -EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(__dev_queue_xmit); +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + struct net_device *dev = skb->dev; + struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + bool again = false; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) + goto drop; + + skb = validate_xmit_skb_list(skb, dev, &again); + if (skb != orig_skb) + goto drop; + + skb_set_queue_mapping(skb, queue_id); + txq = skb_get_tx_queue(dev, skb); -/*======================================================================= - Receiver routines - =======================================================================*/ + local_bh_disable(); -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); + dev_xmit_recursion_inc(); + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); + HARD_TX_UNLOCK(dev, txq); + dev_xmit_recursion_dec(); -int netdev_tstamp_prequeue __read_mostly = 1; -int netdev_budget __read_mostly = 300; -int weight_p __read_mostly = 64; /* old backlog weight */ + local_bh_enable(); + return ret; +drop: + dev_core_stats_tx_dropped_inc(dev); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(__dev_direct_xmit); + +/************************************************************************* + * Receiver routines + *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); + +int weight_p __read_mostly = 64; /* old backlog weight */ +int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ +int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { + struct task_struct *thread; + + lockdep_assert_irqs_disabled(); + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable()/netif_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call + * wake_up_process() when it's not NULL. 
+ */ + thread = READ_ONCE(napi->thread); + if (thread) { + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + wake_up_process(thread); + return; + } + } + +use_local_napi: + DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list)); list_add_tail(&napi->poll_list, &sd->poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + WRITE_ONCE(napi->list_owner, smp_processor_id()); + /* If not called from net_rx_action() + * we have to raise NET_RX_SOFTIRQ. + */ + if (!sd->in_net_rx_action) + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); +struct static_key_false rps_needed __read_mostly; +EXPORT_SYMBOL(rps_needed); +struct static_key_false rfs_needed __read_mostly; +EXPORT_SYMBOL(rfs_needed); -struct static_key rps_needed __read_mostly; +static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +{ + return hash_32(hash, flow_table->log); +} + +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. + * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow *rflow, u16 next_cpu) + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, + u32 flow_id) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { + u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; int rc; @@ -2936,23 +4999,38 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb->rxhash & flow_table->mask; + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; - rflow->filter = rc; - if (old_rflow->filter == rflow->filter) - old_rflow->filter = RPS_NO_FILTER; + rflow = tmp_rflow; + WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + + if (old_rflow->filter == rc) + WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif - rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + head = 
READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); + rps_input_queue_tail_save(&rflow->last_qtail, head); } - rflow->cpu = next_cpu; + WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } @@ -2964,15 +5042,18 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { - struct netdev_rx_queue *rxqueue; - struct rps_map *map; + const struct rps_sock_flow_table *sock_flow_table; + struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; + struct rps_map *map; int cpu = -1; - u16 tcpu; + u32 flow_id; + u32 tcpu; + u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " @@ -2980,44 +5061,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, dev->name, index, dev->real_num_rx_queues); goto done; } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; + rxqueue += index; + } + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ + + flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (map) { - if (map->len == 1 && - !rcu_access_pointer(rxqueue->rps_flow_table)) { - tcpu = map->cpus[0]; - if (cpu_online(tcpu)) - cpu = tcpu; - goto done; - } - } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + if (!flow_table && !map) goto done; - } skb_reset_network_header(skb); - if (!skb_get_rxhash(skb)) + hash = skb_get_hash(skb); + if (!hash) goto done; - flow_table = rcu_dereference(rxqueue->rps_flow_table); - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { - u16 next_cpu; struct rps_dev_flow *rflow; + u32 next_cpu; + u32 ident; - rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; - tcpu = rflow->cpu; + /* First check into global flow table if there is a match. + * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). + */ + ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); + if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) + goto try_rps; + + next_cpu = ident & net_hotdata.rps_cpu_mask; - next_cpu = sock_flow_table->ents[skb->rxhash & - sock_flow_table->mask]; + /* OK, now we know there is a match, + * we can look at the local (per receive queue) flow table + */ + flow_id = rfs_slot(hash, flow_table); + rflow = &flow_table->flows[flow_id]; + tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3025,23 +5110,25 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * have been dequeued, thus preserving in order delivery. 
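get_rps_cpu() above trusts a global sock-flow-table entry only if its upper bits still match the packet hash; one XOR plus the cpu mask both validates the hash and extracts the target CPU. A standalone sketch of that packing, assuming an 8-bit CPU mask purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define CPU_MASK	0xffu	/* pretend rps_cpu_mask for <= 256 CPUs */

static uint32_t make_ent(uint32_t hash, uint32_t cpu)
{
	return (hash & ~CPU_MASK) | (cpu & CPU_MASK);
}

int main(void)
{
	uint32_t hash = 0xdeadbeef;
	uint32_t ent = make_ent(hash, 3);	/* pretend table entry */

	if (((ent ^ hash) & ~CPU_MASK) == 0)
		printf("hash matches, steer to cpu %u\n", ent & CPU_MASK);
	else
		printf("stale entry, fall back to plain RPS\n");
	return 0;
}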
*/ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; - rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, + flow_id); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } - if (map) { - tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; +try_rps: + if (map) { + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; @@ -3072,17 +5159,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id <= flow_table->mask) { + if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; - cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < - (int)(10 * flow_table->mask))) + cpu = READ_ONCE(rflow->cpu); + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); @@ -3098,52 +5184,95 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data) +{ + struct softnet_data *sd = data; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); +} + /* - * Check if this softnet_data structure is another cpu one - * If yes, queue it to our IPI list and return 1 - * If no, return 0 + * After we queued a packet into sd->input_pkt_queue, + * we need to make sure this queue is serviced soon. + * + * - If this is another cpu queue, link it to our rps_ipi_list, + * and make sure we will process rps_ipi_list from net_rx_action(). + * + * - If this is our own queue, NAPI schedule our backlog. + * Note that this also raises NET_RX_SOFTIRQ. */ -static int rps_ipi_queued(struct softnet_data *sd) +static void napi_schedule_rps(struct softnet_data *sd) { -#ifdef CONFIG_RPS - struct softnet_data *mysd = &__get_cpu_var(softnet_data); + struct softnet_data *mysd = this_cpu_ptr(&softnet_data); +#ifdef CONFIG_RPS if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - return 1; + /* If not called from net_rx_action() or napi_threaded_poll() + * we have to raise NET_RX_SOFTIRQ. 
+ */ + if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return; } #endif /* CONFIG_RPS */ - return 0; + __napi_schedule_irqoff(&mysd->backlog); +} + +void kick_defer_list_purge(unsigned int cpu) +{ + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif -static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, + int max_backlog) { #ifdef CONFIG_NET_FLOW_LIMIT - struct sd_flow_limit *fl; - struct softnet_data *sd; unsigned int old_flow, new_flow; + const struct softnet_data *sd; + struct sd_flow_limit *fl; - if (qlen < (netdev_max_backlog >> 1)) + if (likely(qlen < (max_backlog >> 1))) return false; - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -3154,7 +5283,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } @@ -3171,78 +5301,335 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { + enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; + u32 tail; - sd = &per_cpu(softnet_data, cpu); + reason = SKB_DROP_REASON_DEV_READY; + if (unlikely(!netif_running(skb->dev))) + goto bad_dev; - local_irq_save(flags); + sd = &per_cpu(softnet_data, cpu); - rps_lock(sd); + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog) || + skb_flow_limit(skb, qlen, max_backlog)) + goto cpu_backlog_drop; + backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { - if (skb_queue_len(&sd->input_pkt_queue)) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - rps_unlock(sd); - local_irq_restore(flags); - return NET_RX_SUCCESS; - } - - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { - if (!rps_ipi_queued(sd)) - ____napi_schedule(sd, &sd->backlog); + if (likely(qlen <= max_backlog)) { + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. 
+ */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); } - goto enqueue; - } + __skb_queue_tail(&sd->input_pkt_queue, skb); + tail = rps_input_queue_tail_incr(sd); + backlog_unlock_irq_restore(sd, &flags); - sd->dropped++; - rps_unlock(sd); + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); + return NET_RX_SUCCESS; + } - local_irq_restore(flags); + backlog_unlock_irq_restore(sd, &flags); - atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); +cpu_backlog_drop: + reason = SKB_DROP_REASON_CPU_BACKLOG; + numa_drop_add(&sd->drop_counters, 1); +bad_dev: + dev_core_stats_rx_dropped_inc(skb->dev); + kfree_skb_reason(skb, reason); return NET_RX_DROP; } -/** - * netif_rx - post buffer to the network code - * @skb: buffer to post - * - * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. - * - * return values: - * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) - * +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) +{ + void *orig_data, *orig_data_end, *hard_start; + struct netdev_rx_queue *rxqueue; + bool orig_bcast, orig_host; + u32 mac_len, frame_sz; + __be16 orig_eth_type; + struct ethhdr *eth; + u32 metalen, act; + int off; + + /* The XDP program wants to see the packet starting at the MAC + * header. 
+ */ + mac_len = skb->data - skb_mac_header(skb); + hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + frame_sz = (void *)skb_end_pointer(skb) - hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } + + orig_data_end = xdp->data_end; + orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + + /* check if bpf_xdp_adjust_head was used */ + off = xdp->data - orig_data; + if (off) { + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + skb->mac_header += off; + skb_reset_network_header(skb); + } + + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; + if (off != 0) { + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); + skb->len += off; /* positive on grow, negative on shrink */ + } + + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + } + + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + __skb_push(skb, mac_len); + break; + case XDP_PASS: + metalen = xdp->data - xdp->data_meta; + if (metalen) + skb_metadata_set(skb, metalen); + break; + } + + return act; +} + +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) +{ + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. + */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + if (netif_skb_check_for_xdp(pskb, xdp_prog)) + goto do_drop; + } + + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; + default: + bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception((*pskb)->dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + do_drop: + kfree_skb(*pskb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. This also means + * that XDP packets are able to starve other packets going through a qdisc, + * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX + * queues, so they do not have this starvation issue. */ +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; -int netif_rx(struct sk_buff *skb) + txq = netdev_core_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_frozen_or_drv_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + dev_core_stats_tx_dropped_inc(dev); + kfree_skb(skb); + } +} + +static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); + +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { - int ret; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + + if (xdp_prog) { + struct xdp_buff xdp; + u32 act; + int err; - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); + if (act != XDP_PASS) { + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, + &xdp, xdp_prog); + if (err) + goto out_redir; + break; + case XDP_TX: + generic_xdp_tx(*pskb, xdp_prog); + break; + } + bpf_net_ctx_clear(bpf_net_ctx); + return XDP_DROP; + } + bpf_net_ctx_clear(bpf_net_ctx); + } + return XDP_PASS; +out_redir: + bpf_net_ctx_clear(bpf_net_ctx); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); + return XDP_DROP; +} +EXPORT_SYMBOL_GPL(do_xdp_generic); + +static int netif_rx_internal(struct sk_buff *skb) +{ + int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); + #ifdef CONFIG_RPS - if (static_key_false(&rps_needed)) { + if (static_branch_unlikely(&rps_needed)) { struct 
rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -3252,35 +5639,75 @@ int netif_rx(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } -EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) +/** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). + */ +int __netif_rx(struct sk_buff *skb) { - int err; + int ret; - preempt_disable(); - err = netif_rx(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); + lockdep_assert_once(hardirq_count() | softirq_count()); - return err; + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; } -EXPORT_SYMBOL(netif_rx_ni); +EXPORT_SYMBOL(__netif_rx); + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * driver should use NAPI and GRO. + * This function can used from interrupt and from process context. The + * caller from process context must not disable interrupts before invoking + * this function. 
+ * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ +int netif_rx(struct sk_buff *skb) +{ + bool need_bh_off = !(hardirq_count() | softirq_count()); + int ret; + + if (need_bh_off) + local_bh_disable(); + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + if (need_bh_off) + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(netif_rx); -static void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(void) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; @@ -3292,11 +5719,21 @@ static void net_tx_action(struct softirq_action *h) while (clist) { struct sk_buff *skb = clist; + clist = clist->next; - WARN_ON(atomic_read(&skb->users)); - trace_kfree_skb(skb, net_tx_action); - __kfree_skb(skb); + WARN_ON(refcount_read(&skb->users)); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) + trace_consume_skb(skb, net_tx_action); + else + trace_kfree_skb(skb, net_tx_action, + get_kfree_skb_cb(skb)->reason, NULL); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __napi_kfree_skb(skb, + get_kfree_skb_cb(skb)->reason); } } @@ -3309,103 +5746,72 @@ static void net_tx_action(struct softirq_action *h) sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); + rcu_read_lock(); + while (head) { + spinlock_t *root_lock = NULL; + struct sk_buff *to_free; struct Qdisc *q = head; - spinlock_t *root_lock; head = head->next_sched; - root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - qdisc_run(q); - spin_unlock(root_lock); - } else { - if (!test_bit(__QDISC_STATE_DEACTIVATED, - &q->state)) { - __netif_reschedule(q); - } else { - smp_mb__before_clear_bit(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - } + /* We need to make sure head->next_sched is read + * before clearing __QDISC_STATE_SCHED + */ + smp_mb__before_atomic(); + + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, + &q->state))) { + /* There is a synchronize_net() between + * STATE_DEACTIVATED flag being set and + * qdisc_reset()/some_qdisc_is_busy() in + * dev_deactivate(), so we can safely bail out + * early here to avoid data race between + * qdisc_deactivate() and some_qdisc_is_busy() + * for lockless qdisc. + */ + clear_bit(__QDISC_STATE_SCHED, &q->state); + continue; } + + clear_bit(__QDISC_STATE_SCHED, &q->state); + to_free = qdisc_run(q); + if (root_lock) + spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } + + rcu_read_unlock(); } + + xfrm_dev_backlog(sd); } -#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ - (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -#ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? 
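Illustrative sketch, not part of this patch, showing the intended split between the two entry points above in caller terms; the driver, my_hw_fetch_packet() and the surrounding names are hypothetical:

#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static struct sk_buff *my_hw_fetch_packet(struct net_device *dev); /* placeholder */

/* Hard interrupt context: bottom halves cannot run here, so the
 * cheaper __netif_rx() is allowed.
 */
static irqreturn_t my_legacy_isr(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb = my_hw_fetch_packet(dev);

	if (skb) {
		skb->protocol = eth_type_trans(skb, dev);
		__netif_rx(skb);
	}
	return IRQ_HANDLED;
}

/* Process context (e.g. a tunnel decap path): plain netif_rx() disables
 * and re-enables bottom halves around the enqueue, and the caller must
 * not have disabled hard interrupts.
 */
static void my_inject_from_process_context(struct sk_buff *skb)
{
	netif_rx(skb);
}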
otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesn't stop any functionality; if you dont have - * the ingress scheduler, you just can't add policies on ingress. +/** + * netdev_is_rx_handler_busy - check if receive handler is registered + * @dev: device to check * + * Check if a receive handler is already registered for a given device. + * Return true if there one. + * + * The caller must hold the rtnl_mutex. */ -static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) -{ - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); - int result = TC_ACT_OK; - struct Qdisc *q; - - if (unlikely(MAX_RED_LOOP < ttl++)) { - net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", - skb->skb_iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - q = rxq->qdisc; - if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); - } - - return result; -} - -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) +bool netdev_is_rx_handler_busy(struct net_device *dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - - if (!rxq || rxq->qdisc == &noop_qdisc) - goto out; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - switch (ing_filter(skb, rxq)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; - } - -out: - skb->tc_verd = 0; - return skb; + ASSERT_RTNL(); + return dev && rtnl_dereference(dev->rx_handler); } -#endif +EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler @@ -3413,7 +5819,7 @@ out: * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * - * Register a receive hander for a device. This handler will then be + * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. 
* @@ -3425,11 +5831,12 @@ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { - ASSERT_RTNL(); - - if (dev->rx_handler) + if (netdev_is_rx_handler_busy(dev)) return -EBUSY; + if (dev->priv_flags & IFF_NO_RX_HANDLER) + return -EINVAL; + /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); @@ -3467,112 +5874,162 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_ARP): - case __constant_htons(ETH_P_IP): - case __constant_htons(ETH_P_IPV6): - case __constant_htons(ETH_P_8021Q): - case __constant_htons(ETH_P_8021AD): + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): return true; default: return false; } } -static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + if (nf_hook_ingress_active(skb)) { + int ingress_retval; + + if (unlikely(*pt_prev)) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + rcu_read_lock(); + ingress_retval = nf_hook_ingress(skb); + rcu_read_unlock(); + return ingress_retval; + } + return 0; +} + +static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, + struct packet_type **ppt_prev) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; + struct sk_buff *skb = *pskb; struct net_device *orig_dev; - struct net_device *null_or_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!netdev_tstamp_prequeue, skb); + net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); - /* if we've gotten here through NAPI, check netpoll */ - if (netpoll_receive_skb(skb)) - goto out; - orig_dev = skb->dev; skb_reset_network_header(skb); +#if !defined(CONFIG_DEBUG_NET) + /* We plan to no longer reset the transport header here. + * Give some time to fuzzers and dev build to catch bugs + * in network stacks. 
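Illustrative sketch, not part of this patch: a virtual/aggregating device (bridge, bond, team style) attaching to a lower device through the rx_handler hook registered above. The upper device and handler names are hypothetical:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical upper device stealing frames from its lower device. */
static rx_handler_result_t my_upper_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *upper = rcu_dereference(skb->dev->rx_handler_data);

	skb->dev = upper;
	/* Ask __netif_receive_skb_core() to restart delivery on the upper
	 * device (another_round: taps and protocol handlers run again).
	 */
	return RX_HANDLER_ANOTHER;
}

static int my_upper_attach(struct net_device *upper, struct net_device *lower)
{
	int err;

	rtnl_lock();	/* netdev_is_rx_handler_busy() asserts RTNL */
	err = netdev_rx_handler_register(lower, my_upper_handle_frame, upper);
	rtnl_unlock();
	return err;
}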
+ */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); +#endif skb_reset_mac_len(skb); pt_prev = NULL; - rcu_read_lock(); - another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); - if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { - skb = vlan_untag(skb); - if (unlikely(!skb)) - goto unlock; + if (static_branch_unlikely(&generic_xdp_needed_key)) { + int ret2; + + migrate_disable(); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); + migrate_enable(); + + if (ret2 != XDP_PASS) { + ret = NET_RX_DROP; + goto out; + } } -#ifdef CONFIG_NET_CLS_ACT - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - goto ncls; + if (eth_type_vlan(skb->protocol)) { + skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; } -#endif + + if (skb_skip_tc_classify(skb)) + goto skip_classify; if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, + list) { + if (unlikely(pt_prev)) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + + list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { + if (unlikely(pt_prev)) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; } skip_taps: -#ifdef CONFIG_NET_CLS_ACT - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto unlock; -ncls: -#endif +#ifdef CONFIG_NET_INGRESS + if (static_branch_unlikely(&ingress_needed_key)) { + bool another = false; + + nf_skip_egress(skb, true); + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; + if (!skb) + goto out; - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + nf_skip_egress(skb, false); + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto out; + } +#endif + skb_reset_redirect(skb); +skip_classify: + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { + drop_reason = SKB_DROP_REASON_PFMEMALLOC; goto drop; + } - if (vlan_tx_tag_present(skb)) { - if (pt_prev) { + if (skb_vlan_tag_present(skb)) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) - goto unlock; + goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; - goto unlock; + goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; + break; case RX_HANDLER_PASS: break; default: @@ -3580,51 +6037,202 @@ ncls: } } - if (vlan_tx_nonzero_tag_present(skb)) - skb->pkt_type = PACKET_OTHERHOST; + if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { +check_vlan_id: + if (skb_vlan_tag_get_id(skb)) { + /* Vlan id is non 0 and vlan_do_receive() above couldn't + * find vlan device. + */ + skb->pkt_type = PACKET_OTHERHOST; + } else if (eth_type_vlan(skb->protocol)) { + /* Outer header is 802.1P with vlan 0, inner header is + * 802.1Q or 802.1AD and vlan_do_receive() above could + * not find vlan dev for vlan id 0. 
+ */ + __vlan_hwaccel_clear_tag(skb); + skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; + if (vlan_do_receive(&skb)) + /* After stripping off 802.1P header with vlan 0 + * vlan dev is found for inner header. + */ + goto another_round; + else if (unlikely(!skb)) + goto out; + else + /* We have stripped outer 802.1P vlan 0 header. + * But could not find vlan dev. + * check again for vlan id to set OTHERHOST. + */ + goto check_vlan_id; + } + /* Note: we might in the future use prio bits + * and set skb->priority like in vlan_do_receive() + * For the time being, just ignore Priority Code Point + */ + __vlan_hwaccel_clear_tag(skb); + } + + type = skb->protocol; /* deliver only exact match when indicated */ - null_or_dev = deliver_exact ? skb->dev : NULL; + if (likely(!deliver_exact)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &ptype_base[ntohs(type) & + PTYPE_HASH_MASK]); + + /* orig_dev and skb->dev could belong to different netns; + * Even in such case we need to traverse only the list + * coming from skb->dev, as the ptype owner (packet socket) + * will use dev_net(skb->dev) to do namespace filtering. + */ + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &dev_net_rcu(skb->dev)->ptype_specific); + } - type = skb->protocol; - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_dev || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &orig_dev->ptype_specific); + + if (unlikely(skb->dev != orig_dev)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &skb->dev->ptype_specific); } if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) - goto drop; - else - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + *ppt_prev = pt_prev; } else { drop: - atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); + if (!deliver_exact) + dev_core_stats_rx_dropped_inc(skb->dev); + else + dev_core_stats_rx_nohandler_inc(skb->dev); + + kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } -unlock: - rcu_read_unlock(); out: + /* The invariant here is that if *ppt_prev is not NULL + * then skb should also be non-NULL. + * + * Apparently *ppt_prev assignment above holds this invariant due to + * skb dereferencing near it. + */ + *pskb = skb; + return ret; +} + +static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) +{ + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + int ret; + + ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); + if (pt_prev) + ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, + skb->dev, pt_prev, orig_dev); return ret; } +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if ``(page_is_)pfmemalloc``. + * + * This function may only be called from softirq context and interrupts + * should be enabled. 
+ * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_one_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + +static inline void __netif_receive_skb_list_ptype(struct list_head *head, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + struct sk_buff *skb, *next; + + if (!pt_prev) + return; + if (list_empty(head)) + return; + if (pt_prev->list_func != NULL) + INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, + ip_list_rcv, head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + } +} + +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + /* Fast-path assumptions: + * - There is no RX handler. + * - Only one packet_type matches. + * If either of these fails, we will end up doing some per-packet + * processing in-line, then handling the 'last ptype' for the whole + * sublist. This can't cause out-of-order delivery to any single ptype, + * because the 'last ptype' must be constant across the sublist, and all + * other ptypes are handled per-packet. + */ + /* Current (common) ptype of sublist */ + struct packet_type *pt_curr = NULL; + /* Current (common) orig_dev of sublist */ + struct net_device *od_curr = NULL; + struct sk_buff *skb, *next; + LIST_HEAD(sublist); + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + + skb_list_del_init(skb); + __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); + if (!pt_prev) + continue; + if (pt_curr != pt_prev || od_curr != orig_dev) { + /* dispatch old sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + pt_curr = pt_prev; + od_curr = orig_dev; + } + list_add_tail(&skb->list, &sublist); + } + + /* dispatch final sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); +} + static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should @@ -3635,15 +6243,134 @@ static int __netif_receive_skb(struct sk_buff *skb) * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ - current->flags |= PF_MEMALLOC; - ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + noreclaim_flag = memalloc_noreclaim_save(); + ret = __netif_receive_skb_one_core(skb, true); + memalloc_noreclaim_restore(noreclaim_flag); } else - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
*/ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + if (!list_empty(head)) + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_branch_dec(&generic_xdp_needed_key); + } else if (new && !old) { + static_branch_inc(&generic_xdp_needed_key); + netif_disable_lro(dev); + dev_disable_gro_hw(dev); + } + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int netif_receive_skb_internal(struct sk_buff *skb) +{ + int ret; + + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); + + if (skb_defer_rx_timestamp(skb)) + return NET_RX_SUCCESS; + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rps_needed)) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + return ret; + } + } +#endif + ret = __netif_receive_skb(skb); + rcu_read_unlock(); + return ret; +} + +void netif_receive_skb_list_internal(struct list_head *head) +{ + struct sk_buff *skb, *next; + LIST_HEAD(sublist); + + list_for_each_entry_safe(skb, next, head, list) { + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), + skb); + skb_list_del_init(skb); + if (!skb_defer_rx_timestamp(skb)) + list_add_tail(&skb->list, &sublist); + } + list_splice_init(&sublist, head); + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rps_needed)) { + list_for_each_entry_safe(skb, next, head, list) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + /* Will be handled, remove from list */ + skb_list_del_init(skb); + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + } + } + } +#endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process @@ -3661,758 +6388,1565 @@ static int __netif_receive_skb(struct sk_buff *skb) */ int netif_receive_skb(struct sk_buff *skb) { - net_timestamp_check(netdev_tstamp_prequeue, skb); + int ret; - if (skb_defer_rx_timestamp(skb)) - return NET_RX_SUCCESS; + trace_netif_receive_skb_entry(skb); -#ifdef CONFIG_RPS - if (static_key_false(&rps_needed)) { - struct rps_dev_flow voidflow, *rflow = &voidflow; - int cpu, ret; + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); - rcu_read_lock(); + return ret; +} +EXPORT_SYMBOL(netif_receive_skb); - cpu = get_rps_cpu(skb->dev, skb, &rflow); +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. 
+ * + * Since return value of netif_receive_skb() is normally ignored, and + * wouldn't be meaningful for a list, this function returns void. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb; - if (cpu >= 0) { - ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); - rcu_read_unlock(); - return ret; - } - rcu_read_unlock(); + if (list_empty(head)) + return; + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); } -#endif - return __netif_receive_skb(skb); + netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } -EXPORT_SYMBOL(netif_receive_skb); +EXPORT_SYMBOL(netif_receive_skb_list); -/* Network device is going away, flush any packets still pending - * Called with irqs disabled. - */ -static void flush_backlog(void *arg) +/* Network device is going away, flush any packets still pending */ +static void flush_backlog(struct work_struct *work) { - struct net_device *dev = arg; - struct softnet_data *sd = &__get_cpu_var(softnet_data); struct sk_buff *skb, *tmp; + struct sk_buff_head list; + struct softnet_data *sd; + + __skb_queue_head_init(&list); + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); - rps_lock(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev == dev) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - kfree_skb(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } - rps_unlock(sd); + backlog_unlock_irq_enable(sd); + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev == dev) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); + local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } -static int napi_gro_complete(struct sk_buff *skb) +static bool flush_required(int cpu) { - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int err = -ENOENT; +#if IS_ENABLED(CONFIG_RPS) + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + bool do_flush; - BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + backlog_lock_irq_disable(sd); - if (NAPI_GRO_CB(skb)->count == 1) { - skb_shinfo(skb)->gso_size = 0; - goto out; - } + /* as insertion into process_queue happens with the rps lock held, + * process_queue access may race only with dequeue + */ + do_flush = !skb_queue_empty(&sd->input_pkt_queue) || + !skb_queue_empty_lockless(&sd->process_queue); + backlog_unlock_irq_enable(sd); - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; + return do_flush; +#endif + /* without RPS we can't safely check input_pkt_queue: during a + * concurrent remote skb_queue_splice() we can detect as empty both + * input_pkt_queue and process_queue even if the latter could end-up + * containing a lot of packets. 
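Illustrative sketch, not part of this patch: netif_receive_skb_list() above is normally fed from a driver poll loop that batches a whole NAPI budget before handing the list over; my_hw_next_skb() and the helper are placeholders:

#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static struct sk_buff *my_hw_next_skb(struct net_device *dev); /* placeholder */

static int my_poll_rx(struct net_device *dev, int budget)
{
	LIST_HEAD(rx_list);
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = my_hw_next_skb(dev))) {
		skb->protocol = eth_type_trans(skb, dev);
		/* skb->list is the linkage walked by
		 * __netif_receive_skb_list_core().
		 */
		list_add_tail(&skb->list, &rx_list);
		work++;
	}

	/* One pass through the list-receive core for the whole batch
	 * instead of one netif_receive_skb() call per packet.
	 */
	netif_receive_skb_list(&rx_list);
	return work;
}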
+ */ + return true; +} - err = ptype->callbacks.gro_complete(skb); - break; +struct flush_backlogs { + cpumask_t flush_cpus; + struct work_struct w[]; +}; + +static struct flush_backlogs *flush_backlogs_alloc(void) +{ + return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), + GFP_KERNEL); +} + +static struct flush_backlogs *flush_backlogs_fallback; +static DEFINE_MUTEX(flush_backlogs_mutex); + +static void flush_all_backlogs(void) +{ + struct flush_backlogs *ptr = flush_backlogs_alloc(); + unsigned int cpu; + + if (!ptr) { + mutex_lock(&flush_backlogs_mutex); + ptr = flush_backlogs_fallback; } - rcu_read_unlock(); + cpumask_clear(&ptr->flush_cpus); - if (err) { - WARN_ON(&ptype->list == head); - kfree_skb(skb); - return NET_RX_SUCCESS; + cpus_read_lock(); + + for_each_online_cpu(cpu) { + if (flush_required(cpu)) { + INIT_WORK(&ptr->w[cpu], flush_backlog); + queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); + __cpumask_set_cpu(cpu, &ptr->flush_cpus); + } } -out: - return netif_receive_skb(skb); + /* we can have in flight packet[s] on the cpus we are not flushing, + * synchronize_net() in unregister_netdevice_many() will take care of + * them. + */ + for_each_cpu(cpu, &ptr->flush_cpus) + flush_work(&ptr->w[cpu]); + + cpus_read_unlock(); + + if (ptr != flush_backlogs_fallback) + kfree(ptr); + else + mutex_unlock(&flush_backlogs_mutex); } -/* napi->gro_list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +static void net_rps_send_ipi(struct softnet_data *remsd) { - struct sk_buff *skb, *prev = NULL; +#ifdef CONFIG_RPS + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; - /* scan list and build reverse chain */ - for (skb = napi->gro_list; skb != NULL; skb = skb->next) { - skb->prev = prev; - prev = skb; + if (cpu_online(remsd->cpu)) + smp_call_function_single_async(remsd->cpu, &remsd->csd); + remsd = next; } +#endif +} + +/* + * net_rps_action_and_irq_enable sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; - for (skb = prev; skb; skb = prev) { - skb->next = NULL; + if (!use_backlog_threads() && remsd) { + sd->rps_ipi_list = NULL; - if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) - return; + local_irq_enable(); - prev = skb->prev; - napi_gro_complete(skb); - napi->gro_count--; - } + /* Send pending IPI's to kick RPS processing on remote cpus. */ + net_rps_send_ipi(remsd); + } else +#endif + local_irq_enable(); +} - napi->gro_list = NULL; +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return !use_backlog_threads() && sd->rps_ipi_list; +#else + return false; +#endif } -EXPORT_SYMBOL(napi_gro_flush); -static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +static int process_backlog(struct napi_struct *napi, int quota) { - struct sk_buff *p; - unsigned int maclen = skb->dev->hard_header_len; + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + bool again = true; + int work = 0; + + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. 
+ */ + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); + while (again) { + struct sk_buff *skb; + + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); + while ((skb = __skb_dequeue(&sd->process_queue))) { + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); + rcu_read_lock(); + __netif_receive_skb(skb); + rcu_read_unlock(); + if (++work >= quota) { + rps_input_queue_head_add(sd, work); + return work; + } - for (p = napi->gro_list; p; p = p->next) { - unsigned long diffs; + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); + } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; - if (maclen == ETH_HLEN) - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); - else if (!diffs) - diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), - maclen); - NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; + backlog_lock_irq_disable(sd); + if (skb_queue_empty(&sd->input_pkt_queue)) { + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. + */ + napi->state &= NAPIF_STATE_THREADED; + again = false; + } else { + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); + } + backlog_unlock_irq_enable(sd); } + + if (work) + rps_input_queue_head_add(sd, work); + return work; } -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +/** + * __napi_schedule - schedule for receive + * @n: entry to schedule + * + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. + */ +void __napi_schedule(struct napi_struct *n) { - struct sk_buff **pp = NULL; - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int same_flow; - enum gro_result ret; + unsigned long flags; - if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) - goto normal; + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__napi_schedule); - if (skb_is_gso(skb) || skb_has_frag_list(skb)) - goto normal; +/** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable to + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. 
+ */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long new, val = READ_ONCE(n->state); - gro_list_prepare(napi, skb); + do { + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (!try_cmpxchg(&n->state, &val, new)); - skb_set_network_header(skb, skb_gro_offset(skb)); - skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = 0; - NAPI_GRO_CB(skb)->free = 0; + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); - pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); - break; - } - rcu_read_unlock(); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked. + * + * On PREEMPT_RT enabled kernels this maps to __napi_schedule() + * because the interrupt disabled assumption might not be true + * due to force-threaded interrupts and spinlock substitution. + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + else + __napi_schedule(n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); - if (&ptype->list == head) - goto normal; +bool napi_complete_done(struct napi_struct *n, int work_done) +{ + unsigned long flags, val, new, timeout = 0; + bool ret = true; - same_flow = NAPI_GRO_CB(skb)->same_flow; - ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + /* + * 1) Don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu. + * 2) If we are busy polling, do nothing here, we have + * the guarantee we will be called later. + */ + if (unlikely(n->state & (NAPIF_STATE_NPSVC | + NAPIF_STATE_IN_BUSY_POLL))) + return false; + + if (work_done) { + if (n->gro.bitmask) + timeout = napi_get_gro_flush_timeout(n); + n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = napi_get_gro_flush_timeout(n); + if (timeout) + ret = false; + } - if (pp) { - struct sk_buff *nskb = *pp; + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. 
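Illustrative sketch, not part of this patch: the interrupt-handler pattern that napi_schedule_prep() and __napi_schedule_irqoff() are written for; my_disable_rx_irq() is a placeholder for masking the device interrupt:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

static void my_disable_rx_irq(struct net_device *dev); /* placeholder */

static irqreturn_t my_rx_msix(int irq, void *data)
{
	struct napi_struct *napi = data;

	/* Only the context that wins NAPI_STATE_SCHED masks the device IRQ
	 * and queues the NAPI; a loser gets NAPIF_STATE_MISSED recorded
	 * instead. Hard IRQs are off here, so the _irqoff variant is safe.
	 */
	if (napi_schedule_prep(napi)) {
		my_disable_rx_irq(napi->dev);
		__napi_schedule_irqoff(napi);
	}
	return IRQ_HANDLED;
}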
+ */ + gro_flush_normal(&n->gro, !!timeout); - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); - napi->gro_count--; + if (unlikely(!list_empty(&n->poll_list))) { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + list_del_init(&n->poll_list); + local_irq_restore(flags); } + WRITE_ONCE(n->list_owner, -1); - if (same_flow) - goto ok; + val = READ_ONCE(n->state); + do { + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) - goto normal; + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_SCHED_THREADED | + NAPIF_STATE_PREFER_BUSY_POLL); - napi->gro_count++; - NAPI_GRO_CB(skb)->count = 1; - NAPI_GRO_CB(skb)->age = jiffies; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); - skb->next = napi->gro_list; - napi->gro_list = skb; - ret = GRO_HELD; + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. + */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (!try_cmpxchg(&n->state, &val, new)); -pull: - if (skb_headlen(skb) < skb_gro_offset(skb)) { - int grow = skb_gro_offset(skb) - skb_headlen(skb); + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } - BUG_ON(skb->end - skb->tail < grow); + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; +} +EXPORT_SYMBOL(napi_complete_done); - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); +static void skb_defer_free_flush(void) +{ + struct llist_node *free_list; + struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - skb->tail += grow; - skb->data_len -= grow; + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; - skb_shinfo(skb)->frags[0].page_offset += grow; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(skb_shinfo(skb)->frags, - skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + prefetch(next); + napi_consume_skb(skb, 1); } } +} -ok: - return ret; +#if defined(CONFIG_NET_RX_BUSY_POLL) + +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(&napi->gro); + __napi_schedule(napi); + return; + } -normal: - ret = GRO_NORMAL; - goto pull; + /* Flush too old packets. 
If HZ < 1000, flush all packets */ + gro_flush_normal(&napi->gro, HZ >= 1000); + + clear_bit(NAPI_STATE_SCHED, &napi->state); } +enum { + NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, +}; -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { - switch (ret) { - case GRO_NORMAL: - if (netif_receive_skb(skb)) - ret = GRO_DROP; - break; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + bool skip_schedule = false; + unsigned long timeout; + int rc; - case GRO_DROP: - kfree_skb(skb); - break; + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + + if (flags & NAPI_F_PREFER_BUSY_POLL) { + napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); + timeout = napi_get_gro_flush_timeout(napi); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - kmem_cache_free(skbuff_head_cache, skb); - else - __kfree_skb(skb); - break; + /* All we really want here is to re-enable device interrupts. + * Ideally, a new ndo_busy_poll_stop() could avoid another round. + */ + rc = napi->poll(napi, budget); + /* We can't gro_normal_list() here, because napi->poll() might have + * rearmed the napi (napi_complete_done()) in which case it could + * already be running on another CPU. + */ + trace_napi_poll(napi, rc, budget); + netpoll_poll_unlock(have_poll_lock); + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); + bpf_net_ctx_clear(bpf_net_ctx); + local_bh_enable(); +} + +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) +{ + unsigned long start_time = loop_end ? busy_loop_current_time() : 0; + int (*napi_poll)(struct napi_struct *napi, int budget); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + void *have_poll_lock = NULL; + struct napi_struct *napi; - case GRO_HELD: - case GRO_MERGED: - break; + WARN_ON_ONCE(!rcu_read_lock_held()); + +restart: + napi_poll = NULL; + + napi = napi_by_id(napi_id); + if (!napi) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + for (;;) { + int work = 0; + + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + if (!napi_poll) { + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. 
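Illustrative sketch, not part of this patch: the napi->poll() callback invoked by busy_poll_stop()/__napi_busy_loop() (and by net_rx_action()) conventionally has this shape; my_clean_rx() and my_enable_rx_irq() are placeholders:

#include <linux/netdevice.h>

static int my_clean_rx(struct napi_struct *napi, int budget);	/* placeholder */
static void my_enable_rx_irq(struct net_device *dev);		/* placeholder */

static int my_napi_poll(struct napi_struct *napi, int budget)
{
	int work = my_clean_rx(napi, budget);

	if (work < budget && napi_complete_done(napi, work)) {
		/* Re-arm the device interrupt only when the core really
		 * completed the NAPI: no gro_flush_timeout deferral, no
		 * busy-poll owner, no MISSED reschedule.
		 */
		my_enable_rx_irq(napi->dev);
	}

	/* Returning budget keeps the instance on the poll list. */
	return work;
}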
+ */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) { + if (flags & NAPI_F_PREFER_BUSY_POLL) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + goto count; + } + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) { + if (flags & NAPI_F_PREFER_BUSY_POLL) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + goto count; + } + have_poll_lock = netpoll_poll_lock(napi); + napi_poll = napi->poll; + } + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); + gro_normal_list(&napi->gro); +count: + if (work > 0) + __NET_ADD_STATS(dev_net(napi->dev), + LINUX_MIB_BUSYPOLLRXPACKETS, work); + skb_defer_free_flush(); + bpf_net_ctx_clear(bpf_net_ctx); + local_bh_enable(); + + if (!loop_end || loop_end(loop_end_arg, start_time)) + break; + + if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; + if (napi_poll) + busy_poll_stop(napi, have_poll_lock, flags, budget); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (loop_end(loop_end_arg, start_time)) + return; + goto restart; + } + cpu_relax(); } + if (napi_poll) + busy_poll_stop(napi, have_poll_lock, flags, budget); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); +} - return ret; +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); } -static void skb_gro_reset_offset(struct sk_buff *skb) +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; + unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); + rcu_read_unlock(); +} +EXPORT_SYMBOL(napi_busy_loop); + +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); } + rcu_read_unlock(); } -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +void napi_resume_irqs(unsigned int napi_id) { - skb_gro_reset_offset(skb); + struct napi_struct *napi; - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. 
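Illustrative sketch, not part of this patch: from user space the busy-poll loop above is opted into per socket. The values are arbitrary, and SO_PREFER_BUSY_POLL / SO_BUSY_POLL_BUDGET assume reasonably recent uapi headers:

/* Userspace example; build with normal libc headers. */
#include <stdio.h>
#include <sys/socket.h>

static int enable_busy_poll(int fd)
{
	int usecs = 50;		/* SO_BUSY_POLL: spin up to 50us */
	int prefer = 1;		/* SO_PREFER_BUSY_POLL */
	int budget = 16;	/* SO_BUSY_POLL_BUDGET */

	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
		       &usecs, sizeof(usecs)) ||
	    setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
		       &prefer, sizeof(prefer)) ||
	    setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
		       &budget, sizeof(budget))) {
		perror("setsockopt");
		return -1;
	}
	return 0;
}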
+ */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); } -EXPORT_SYMBOL(napi_gro_receive); -static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +static void __napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) { - __skb_pull(skb, skb_headlen(skb)); - /* restore the reserve we had after netdev_alloc_skb_ip_align() */ - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; - skb->dev = napi->dev; - skb->skb_iif = 0; + napi->gro.cached_napi_id = napi_id; - napi->skb = skb; + WRITE_ONCE(napi->napi_id, napi_id); + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } -struct sk_buff *napi_get_frags(struct napi_struct *napi) +static void napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) { - struct sk_buff *skb = napi->skb; + unsigned long flags; - if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); - if (skb) - napi->skb = skb; - } - return skb; + spin_lock_irqsave(&napi_hash_lock, flags); + WARN_ON_ONCE(napi_by_id(napi_id)); + __napi_hash_add_with_id(napi, napi_id); + spin_unlock_irqrestore(&napi_hash_lock, flags); } -EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +static void napi_hash_add(struct napi_struct *napi) { - switch (ret) { - case GRO_NORMAL: - case GRO_HELD: - skb->protocol = eth_type_trans(skb, skb->dev); + unsigned long flags; - if (ret == GRO_HELD) - skb_gro_pull(skb, -ETH_HLEN); - else if (netif_receive_skb(skb)) - ret = GRO_DROP; - break; + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) + return; - case GRO_DROP: - case GRO_MERGED_FREE: - napi_reuse_skb(napi, skb); - break; + spin_lock_irqsave(&napi_hash_lock, flags); - case GRO_MERGED: - break; + /* 0..NR_CPUS range is reserved for sender_cpu use */ + do { + if (unlikely(!napi_id_valid(++napi_gen_id))) + napi_gen_id = MIN_NAPI_ID; + } while (napi_by_id(napi_gen_id)); + + __napi_hash_add_with_id(napi, napi_gen_id); + + spin_unlock_irqrestore(&napi_hash_lock, flags); +} + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +static void napi_hash_del(struct napi_struct *napi) +{ + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); + + hlist_del_init_rcu(&napi->napi_hash_node); + + spin_unlock_irqrestore(&napi_hash_lock, flags); +} + +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. + */ + if (!napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + __napi_schedule_irqoff(napi); } - return ret; + return HRTIMER_NORESTART; } -static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +static void napi_stop_kthread(struct napi_struct *napi) { - struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - unsigned int hlen; - unsigned int off; + unsigned long val, new; - napi->skb = NULL; + /* Wait until the napi STATE_THREADED is unset. 
*/ + while (true) { + val = READ_ONCE(napi->state); - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); - - off = skb_gro_offset(skb); - hlen = off + sizeof(*eth); - eth = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - eth = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!eth)) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + /* If napi kthread own this napi or the napi is idle, + * STATE_THREADED can be unset here. + */ + if ((val & NAPIF_STATE_SCHED_THREADED) || + !(val & NAPIF_STATE_SCHED)) { + new = val & (~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL)); + } else { + msleep(20); + continue; } - } - skb_gro_pull(skb, sizeof(*eth)); + if (try_cmpxchg(&napi->state, &val, new)) + break; + } - /* - * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. + /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by + * the kthread. */ - skb->protocol = eth->h_proto; + while (true) { + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) + break; -out: - return skb; + msleep(20); + } + + kthread_stop(napi->thread); + napi->thread = NULL; } -gro_result_t napi_gro_frags(struct napi_struct *napi) +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded_mode) { - struct sk_buff *skb = napi_frags_skb(napi); - - if (!skb) - return GRO_DROP; + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); } -EXPORT_SYMBOL(napi_gro_frags); -/* - * net_rps_action sends any pending IPI's for rps. - * Note: called with local irq disabled, but exits with local irq enabled. - */ -static void net_rps_action_and_irq_enable(struct softnet_data *sd) +int napi_set_threaded(struct napi_struct *napi, + enum netdev_napi_threaded threaded) { -#ifdef CONFIG_RPS - struct softnet_data *remsd = sd->rps_ipi_list; + if (threaded) { + if (!napi->thread) { + int err = napi_kthread_create(napi); - if (remsd) { - sd->rps_ipi_list = NULL; + if (err) + return err; + } + } - local_irq_enable(); + if (napi->config) + napi->config->threaded = threaded; - /* Send pending IPI's to kick RPS processing on remote cpus. */ - while (remsd) { - struct softnet_data *next = remsd->rps_ipi_next; + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + if (!threaded && napi->thread) { + napi_stop_kthread(napi); + } else { + /* Make sure kthread is created before THREADED bit is set. 
*/ + smp_mb__before_atomic(); + napi_set_threaded_state(napi, threaded); + } - if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); - remsd = next; - } - } else -#endif - local_irq_enable(); + return 0; } -static int process_backlog(struct napi_struct *napi, int quota) +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { - int work = 0; - struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + struct napi_struct *napi; + int i, err = 0; -#ifdef CONFIG_RPS - /* Check if we have pending ipi, its better to send them now, - * not waiting net_rx_action() end. - */ - if (sd->rps_ipi_list) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } -#endif - napi->weight = weight_p; - local_irq_disable(); - while (work < quota) { - struct sk_buff *skb; - unsigned int qlen; + netdev_assert_locked_or_invisible(dev); - while ((skb = __skb_dequeue(&sd->process_queue))) { - local_irq_enable(); - __netif_receive_skb(skb); - local_irq_disable(); - input_queue_head_incr(sd); - if (++work >= quota) { - local_irq_enable(); - return work; + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = NETDEV_NAPI_THREADED_DISABLED; + break; + } } } + } - rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); + WRITE_ONCE(dev->threaded, threaded); - if (qlen < quota - work) { - /* - * Inline a custom version of __napi_complete(). - * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), - * and we dont need an smp_mb() memory barrier. - */ - list_del(&napi->poll_list); - napi->state = 0; + /* The error should not occur as the kthreads are already created. */ + list_for_each_entry(napi, &dev->napi_list, dev_list) + WARN_ON_ONCE(napi_set_threaded(napi, threaded)); - quota = work + qlen; - } - rps_unlock(sd); - } - local_irq_enable(); + /* Override the config for all NAPIs even if currently not listed */ + for (i = 0; i < dev->num_napi_configs; i++) + dev->napi_config[i].threaded = threaded; - return work; + return err; } /** - * __napi_schedule - schedule for receive - * @n: entry to schedule + * netif_threaded_enable() - enable threaded NAPIs + * @dev: net_device instance * - * The entry's receive function will be scheduled to run + * Enable threaded mode for the NAPI instances of the device. This may be useful + * for devices where multiple NAPI instances get scheduled by a single + * interrupt. Threaded NAPI allows moving the NAPI processing to cores other + * than the core where IRQ is mapped. + * + * This function should be called before @dev is registered. */ -void __napi_schedule(struct napi_struct *n) +void netif_threaded_enable(struct net_device *dev) { - unsigned long flags; + WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED)); +} +EXPORT_SYMBOL(netif_threaded_enable); - local_irq_save(flags); - ____napi_schedule(&__get_cpu_var(softnet_data), n); - local_irq_restore(flags); +/** + * netif_queue_set_napi - Associate queue with the napi + * @dev: device to which NAPI and queue belong + * @queue_index: Index of queue + * @type: queue type as RX or TX + * @napi: NAPI context, pass NULL to clear previously set NAPI + * + * Set queue with its corresponding napi context. 
This should be done after + * registering the NAPI handler for the queue-vector and the queues have been + * mapped to the corresponding interrupt vector. + */ +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, struct napi_struct *napi) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + + if (WARN_ON_ONCE(napi && !napi->dev)) + return; + netdev_ops_assert_locked_or_invisible(dev); + + switch (type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(dev, queue_index); + rxq->napi = napi; + return; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(dev, queue_index); + txq->napi = napi; + return; + default: + return; + } } -EXPORT_SYMBOL(__napi_schedule); +EXPORT_SYMBOL(netif_queue_set_napi); -void __napi_complete(struct napi_struct *n) +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) { - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); - list_del(&n->poll_list); - smp_mb__before_clear_bit(); - clear_bit(NAPI_STATE_SCHED, &n->state); +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif } -EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) { - unsigned long flags; + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; - /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu - */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} - napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; } -EXPORT_SYMBOL(napi_complete); +EXPORT_SYMBOL(netif_enable_cpu_rmap); -/* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static void netif_del_cpu_rmap(struct net_device *dev) { - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; + struct cpu_rmap *rmap = dev->rx_cpu_rmap; - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; + if (!dev->rx_cpu_rmap_auto) + return; - return NULL; + /* Free the rmap */ + cpu_rmap_put(rmap); + dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; } -EXPORT_SYMBOL_GPL(napi_by_id); -void napi_hash_add(struct napi_struct *napi) +#else +static void netif_napi_affinity_release(struct kref *ref) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { +} - 
spin_lock(&napi_hash_lock); +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); - /* 0 is not a valid id, we also skip an id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; - } +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); - spin_unlock(&napi_hash_lock); +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; } -EXPORT_SYMBOL_GPL(napi_hash_add); +EXPORT_SYMBOL(netif_napi_set_irq_locked); -/* Warning : caller is responsible to make sure rcu grace period - * is respected before freeing memory containing @napi - */ -void napi_hash_del(struct napi_struct *napi) +static void napi_restore_config(struct napi_struct *n) { - spin_lock(&napi_hash_lock); + n->defer_hard_irqs = n->config->defer_hard_irqs; + n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) - hlist_del_rcu(&napi->napi_hash_node); + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + + /* a NAPI ID might be stored in the config, if so use it. if not, use + * napi_hash_add to generate one for us. 
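+	 * Either way the NAPI ends up in the napi_hash table keyed by
+	 * its ID, which is how busy polling looks it up again later.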
+ */ + if (n->config->napi_id) { + napi_hash_add_with_id(n, n->config->napi_id); + } else { + napi_hash_add(n); + n->config->napi_id = n->napi_id; + } - spin_unlock(&napi_hash_lock); + WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded)); } -EXPORT_SYMBOL_GPL(napi_hash_del); -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +static void napi_save_config(struct napi_struct *n) { + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; + napi_hash_del(n); +} + +/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will + * inherit an existing ID try to insert it at the right position. + */ +static void +netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) +{ + unsigned int new_id, pos_id; + struct list_head *higher; + struct napi_struct *pos; + + new_id = UINT_MAX; + if (napi->config && napi->config->napi_id) + new_id = napi->config->napi_id; + + higher = &dev->napi_list; + list_for_each_entry(pos, &dev->napi_list, dev_list) { + if (napi_id_valid(pos->napi_id)) + pos_id = pos->napi_id; + else if (pos->config) + pos_id = pos->config->napi_id; + else + pos_id = UINT_MAX; + + if (pos_id <= new_id) + break; + higher = &pos->dev_list; + } + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ +} + +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + +void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + netdev_assert_locked(dev); + if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) + return; + INIT_LIST_HEAD(&napi->poll_list); - napi->gro_count = 0; - napi->gro_list = NULL; + INIT_HLIST_NODE(&napi->napi_hash_node); + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + gro_init(&napi->gro); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) - pr_err_once("netif_napi_add() called with weight %d on device %s\n", - weight, dev->name); + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); napi->weight = weight; - list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL - spin_lock_init(&napi->poll_lock); napi->poll_owner = -1; #endif + napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); + set_bit(NAPI_STATE_NPSVC, &napi->state); + netif_napi_dev_list_add(dev, napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable + */ + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); + + napi_get_frags_check(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). 
+ */ + if (napi_get_threaded_config(dev, napi)) + if (napi_kthread_create(napi)) + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; + netif_napi_set_irq_locked(napi, -1); } -EXPORT_SYMBOL(netif_napi_add); +EXPORT_SYMBOL(netif_napi_add_weight_locked); -void netif_napi_del(struct napi_struct *napi) +void napi_disable_locked(struct napi_struct *n) { - struct sk_buff *skb, *next; + unsigned long val, new; + + might_sleep(); + netdev_assert_locked(n->dev); + + set_bit(NAPI_STATE_DISABLE, &n->state); + + val = READ_ONCE(n->state); + do { + while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { + usleep_range(20, 200); + val = READ_ONCE(n->state); + } + + new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; + new &= ~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL | + NAPIF_STATE_PREFER_BUSY_POLL); + } while (!try_cmpxchg(&n->state, &val, new)); + + hrtimer_cancel(&n->timer); + + if (n->config) + napi_save_config(n); + else + napi_hash_del(n); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable_locked); + +/** + * napi_disable() - prevent NAPI from scheduling + * @n: NAPI context + * + * Stop NAPI from being scheduled on this context. + * Waits till any outstanding processing completes. + * Takes netdev_lock() for associated net_device. + */ +void napi_disable(struct napi_struct *n) +{ + netdev_lock(n->dev); + napi_disable_locked(n); + netdev_unlock(n->dev); +} +EXPORT_SYMBOL(napi_disable); + +void napi_enable_locked(struct napi_struct *n) +{ + unsigned long new, val = READ_ONCE(n->state); - list_del_init(&napi->dev_list); + if (n->config) + napi_restore_config(n); + else + napi_hash_add(n); + + do { + BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); + + new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); + if (n->dev->threaded && n->thread) + new |= NAPIF_STATE_THREADED; + } while (!try_cmpxchg(&n->state, &val, new)); +} +EXPORT_SYMBOL(napi_enable_locked); + +/** + * napi_enable() - enable NAPI scheduling + * @n: NAPI context + * + * Enable scheduling of a NAPI instance. + * Must be paired with napi_disable(). + * Takes netdev_lock() for associated net_device. + */ +void napi_enable(struct napi_struct *n) +{ + netdev_lock(n->dev); + napi_enable_locked(n); + netdev_unlock(n->dev); +} +EXPORT_SYMBOL(napi_enable); + +/* Must be called in process context */ +void __netif_napi_del_locked(struct napi_struct *napi) +{ + netdev_assert_locked(napi->dev); + + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) + return; + + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + if (napi->config) { + napi->index = -1; + napi->config = NULL; + } + + list_del_rcu(&napi->dev_list); napi_free_frags(napi); - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; - skb->next = NULL; - kfree_skb(skb); + gro_cleanup(&napi->gro); + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; } +} +EXPORT_SYMBOL(__netif_napi_del_locked); - napi->gro_list = NULL; - napi->gro_count = 0; +static int __napi_poll(struct napi_struct *n, bool *repoll) +{ + int work, weight; + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. 
+ */ + work = 0; + if (napi_is_scheduled(n)) { + work = n->poll(n, weight); + trace_napi_poll(n, work, weight); + + xdp_do_check_flushed(n); + } + + if (unlikely(work > weight)) + netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", + n->poll, work, weight); + + if (likely(work < weight)) + return work; + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + return work; + } + + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + return work; + } + + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush_normal(&n->gro, HZ >= 1000); + + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + return work; + } + + *repoll = true; + + return work; } -EXPORT_SYMBOL(netif_napi_del); -static void net_rx_action(struct softirq_action *h) +static int napi_poll(struct napi_struct *n, struct list_head *repoll) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); - unsigned long time_limit = jiffies + 2; - int budget = netdev_budget; + bool do_repoll = false; void *have; + int work; - local_irq_disable(); - - while (!list_empty(&sd->poll_list)) { - struct napi_struct *n; - int work, weight; + list_del_init(&n->poll_list); - /* If softirq window is exhuasted then punt. - * Allow this to run for 2 jiffies since which will allow - * an average latency of 1.5/HZ. - */ - if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) - goto softnet_break; + have = netpoll_poll_lock(n); - local_irq_enable(); + work = __napi_poll(n, &do_repoll); - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + if (do_repoll) { +#if defined(CONFIG_DEBUG_NET) + if (unlikely(!napi_is_scheduled(n))) + pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n", + n->dev->name, n->poll); +#endif + list_add_tail(&n->poll_list, repoll); + } + netpoll_poll_unlock(have); - have = netpoll_poll_lock(n); + return work; +} - weight = n->weight; +static int napi_thread_wait(struct napi_struct *napi) +{ + set_current_state(TASK_INTERRUPTIBLE); - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidentally calling ->poll() when NAPI is not scheduled. + while (!kthread_should_stop()) { + /* Testing SCHED_THREADED bit here to make sure the current + * kthread owns this napi and could poll on this napi. + * Testing SCHED bit is not enough because SCHED bit might be + * set by some other busy poll thread or by napi_disable(). 
*/ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { - work = n->poll(n, weight); - trace_napi_poll(n); + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; } - WARN_ON_ONCE(work > weight); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); - budget -= work; + return -1; +} - local_irq_disable(); +static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) +{ + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + struct softnet_data *sd; + unsigned long last_qs = jiffies; - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. + for (;;) { + bool repoll = false; + void *have; + + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + sd->in_napi_threaded_poll = false; + barrier(); + + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(); + bpf_net_ctx_clear(bpf_net_ctx); + + /* When busy poll is enabled, the old packets are not flushed in + * napi_complete_done. So flush them here. */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); - napi_complete(n); - local_irq_disable(); - } else { - if (n->gro_list) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - local_irq_enable(); - napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); - } - list_move_tail(&n->poll_list, &sd->poll_list); - } + if (busy_poll) + gro_flush_normal(&napi->gro, HZ >= 1000); + local_bh_enable(); + + /* Call cond_resched here to avoid watchdog warnings. 
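+		 * rcu_softirq_qs_periodic() additionally reports an RCU
+		 * quiescent state now and then so a kthread that keeps
+		 * re-polling does not stall RCU grace periods.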
*/ + if (repoll || busy_poll) { + rcu_softirq_qs_periodic(last_qs); + cond_resched(); } - netpoll_poll_unlock(have); + if (!repoll) + break; } -out: - net_rps_action_and_irq_enable(sd); +} -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + bool want_busy_poll; + bool in_busy_poll; + unsigned long val; - return; + while (!napi_thread_wait(napi)) { + val = READ_ONCE(napi->state); -softnet_break: - sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto out; -} + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; -struct netdev_upper { - struct net_device *dev; - bool master; - struct list_head list; - struct rcu_head rcu; - struct list_head search_list; -}; + if (unlikely(val & NAPIF_STATE_DISABLE)) + want_busy_poll = false; -static void __append_search_uppers(struct list_head *search_list, - struct net_device *dev) -{ - struct netdev_upper *upper; + if (want_busy_poll != in_busy_poll) + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, + want_busy_poll); - list_for_each_entry(upper, &dev->upper_dev_list, list) { - /* check if this upper is not already in search list */ - if (list_empty(&upper->search_list)) - list_add_tail(&upper->search_list, search_list); + napi_threaded_poll_loop(napi, want_busy_poll); } + + return 0; } -static bool __netdev_search_upper_dev(struct net_device *dev, - struct net_device *upper_dev) +static __latent_entropy void net_rx_action(void) { - LIST_HEAD(search_list); - struct netdev_upper *upper; - struct netdev_upper *tmp; - bool ret = false; + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int budget = READ_ONCE(net_hotdata.netdev_budget); + LIST_HEAD(list); + LIST_HEAD(repoll); + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); +start: + sd->in_net_rx_action = true; + local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); - __append_search_uppers(&search_list, dev); - list_for_each_entry(upper, &search_list, search_list) { - if (upper->dev == upper_dev) { - ret = true; + for (;;) { + struct napi_struct *n; + + skb_defer_free_flush(); + + if (list_empty(&list)) { + if (list_empty(&repoll)) { + sd->in_net_rx_action = false; + barrier(); + /* We need to check if ____napi_schedule() + * had refilled poll_list while + * sd->in_net_rx_action was true. + */ + if (!list_empty(&sd->poll_list)) + goto start; + if (!sd_has_rps_ipi_waiting(sd)) + goto end; + } + break; + } + + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + + /* If softirq window is exhausted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. 
+ */ + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } - __append_search_uppers(&search_list, upper->dev); } - list_for_each_entry_safe(upper, tmp, &search_list, search_list) - INIT_LIST_HEAD(&upper->search_list); - return ret; + + local_irq_disable(); + + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + else + sd->in_net_rx_action = false; + + net_rps_action_and_irq_enable(sd); +end: + bpf_net_ctx_clear(bpf_net_ctx); } -static struct netdev_upper *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +struct netdev_adjacent { + struct net_device *dev; + netdevice_tracker dev_tracker; + + /* upper master flag, there can only be one master device per list */ + bool master; + + /* lookup ignore flag */ + bool ignore; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + + /* private field for the users */ + void *private; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, + struct list_head *adj_list) { - struct netdev_upper *upper; + struct netdev_adjacent *adj; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; + list_for_each_entry(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; } return NULL; } +static int ____netdev_has_upper_dev(struct net_device *upper_dev, + struct netdev_nested_priv *priv) +{ + struct net_device *dev = (struct net_device *)priv->data; + + return upper_dev == dev; +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4425,13 +7959,40 @@ static struct netdev_upper *__netdev_find_upper(struct net_device *dev, bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + ASSERT_RTNL(); - return __netdev_find_upper(dev, upper_dev); + return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, + &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks the entire upper device chain. + * The caller must hold rcu lock. 
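+ *
+ * Hypothetical usage sketch (editorial illustration; port_dev and bond_dev
+ * are made-up names for a lower and an upper device):
+ *
+ *	rcu_read_lock();
+ *	stacked = netdev_has_upper_dev_all_rcu(port_dev, bond_dev);
+ *	rcu_read_unlock();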
+ */ + +bool netdev_has_upper_dev_all_rcu(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + + return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, + &priv); +} +EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); + +/** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * @@ -4442,7 +8003,7 @@ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->upper_dev_list); + return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); @@ -4455,21 +8016,597 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev); */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); - if (list_empty(&dev->upper_dev_list)) + if (list_empty(&dev->adj_list.upper)) return NULL; - upper = list_first_entry(&dev->upper_dev_list, - struct netdev_upper, list); + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); +static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + ASSERT_RTNL(); + + if (list_empty(&dev->adj_list.upper)) + return NULL; + + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master) && !upper->ignore) + return upper->dev; + return NULL; +} + +/** + * netdev_has_any_lower_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to a lower device and return true in case + * it is. The caller must hold the RTNL lock. + */ +static bool netdev_has_any_lower_dev(struct net_device *dev) +{ + ASSERT_RTNL(); + + return !list_empty(&dev->adj_list.lower); +} + +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. 
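+ *
+ * Hypothetical iteration sketch (editorial illustration; handle_upper() is a
+ * made-up callback, and the netdev_for_each_upper_dev_rcu() helper wraps the
+ * same pattern):
+ *
+ *	struct list_head *iter = &dev->adj_list.upper;
+ *	struct net_device *upper;
+ *
+ *	rcu_read_lock();
+ *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
+ *		handle_upper(upper);
+ *	rcu_read_unlock();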
+ */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + +static struct net_device *__netdev_next_upper_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *upper; + + upper = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + *ignore = upper->ignore; + + return upper->dev; +} + +static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} + +static int __netdev_walk_all_upper_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = __netdev_next_upper_dev(now, &iter, &ignore); + if (!udev) + break; + if (ignore) + continue; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + +int netdev_walk_all_upper_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = netdev_next_upper_dev_rcu(now, &iter); + if (!udev) + break; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); + +static bool __netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = (void *)upper_dev, + }; + + ASSERT_RTNL(); + + return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, + &priv); +} + +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. 
The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +static struct net_device *netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} + +static struct net_device *__netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + *ignore = lower->ignore; + + return lower->dev; +} + +int netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} 
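+/* Hypothetical usage sketch for netdev_walk_all_lower_dev() (editorial
+ * illustration; count_lowers() and count are made-up names). The callback is
+ * invoked once for every device below @dev, but not for @dev itself, and a
+ * non-zero return value stops the walk and is propagated to the caller:
+ *
+ *	static int count_lowers(struct net_device *ldev,
+ *				struct netdev_nested_priv *priv)
+ *	{
+ *		(*(unsigned int *)priv->data)++;
+ *		return 0;
+ *	}
+ *
+ *	unsigned int count = 0;
+ *	struct netdev_nested_priv priv = { .data = &count };
+ *
+ *	netdev_walk_all_lower_dev(dev, count_lowers, &priv);
+ */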
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); + +static int __netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = __netdev_next_lower_dev(now, &iter, &ignore); + if (!ldev) + break; + if (ignore) + continue; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); + +static u8 __netdev_upper_depth(struct net_device *dev) +{ + struct net_device *udev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.upper, + udev = __netdev_next_upper_dev(dev, &iter, &ignore); + udev; + udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < udev->upper_level) + max_depth = udev->upper_level; + } + + return max_depth; +} + +static u8 __netdev_lower_depth(struct net_device *dev) +{ + struct net_device *ldev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.lower, + ldev = __netdev_next_lower_dev(dev, &iter, &ignore); + ldev; + ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < ldev->lower_level) + max_depth = ldev->lower_level; + } + + return max_depth; +} + +static int __netdev_update_upper_level(struct net_device *dev, + struct netdev_nested_priv *__unused) +{ + dev->upper_level = __netdev_upper_depth(dev) + 1; + return 0; +} + +#ifdef CONFIG_LOCKDEP +static LIST_HEAD(net_unlink_list); + +static void net_unlink_todo(struct net_device *dev) +{ + if (list_empty(&dev->unlink_list)) + list_add_tail(&dev->unlink_list, &net_unlink_list); +} +#endif + +static int __netdev_update_lower_level(struct net_device *dev, + struct netdev_nested_priv *priv) +{ + dev->lower_level = __netdev_lower_depth(dev) + 1; + +#ifdef CONFIG_LOCKDEP + if (!priv) + return 0; + + if (priv->flags & NESTED_SYNC_IMM) + dev->nested_level = dev->lower_level - 1; + if (priv->flags & NESTED_SYNC_TODO) + net_unlink_todo(dev); +#endif + return 0; +} + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev_rcu(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = 
now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); + +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device @@ -4479,20 +8616,218 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get); */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; - upper = list_first_or_null_rcu(&dev->upper_dev_list, - struct netdev_upper, list); + upper = list_first_or_null_rcu(&dev->adj_list.upper, + struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -static int __netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, bool master) +static int netdev_adjacent_sysfs_add(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) { - struct netdev_upper *upper; + char linkname[IFNAMSIZ+7]; + + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", adj_dev->name); + return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), + linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, + char *name, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", name); + sysfs_remove_link(&(dev->dev.kobj), linkname); +} + +static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + return (dev_list == &dev->adj_list.upper || + dev_list == &dev->adj_list.lower) && + net_eq(dev_net(dev), dev_net(adj_dev)); +} + +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list, + void *private, bool master) +{ + struct netdev_adjacent *adj; + int ret; + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (adj) { + adj->ref_nr += 1; + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", + dev->name, adj_dev->name, adj->ref_nr); + + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->ref_nr = 1; + adj->private = private; + adj->ignore = false; + netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); + + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", + dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { + ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); + if (ret) + goto free_adj; + } + + /* Ensure that master link is always the first item in list. 
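+	 * so that netdev_master_upper_dev_get() can simply look at the
+	 * first entry of the upper list.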
*/ + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto remove_symlinks; + + list_add_rcu(&adj->list, dev_list); + } else { + list_add_tail_rcu(&adj->list, dev_list); + } + + return 0; + +remove_symlinks: + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: + netdev_put(adj_dev, &adj->dev_tracker); + kfree(adj); + + return ret; +} + +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + u16 ref_nr, + struct list_head *dev_list) +{ + struct netdev_adjacent *adj; + + pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", + dev->name, adj_dev->name, ref_nr); + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (!adj) { + pr_err("Adjacency does not exist for device %s from %s\n", + dev->name, adj_dev->name); + WARN_ON(1); + return; + } + + if (adj->ref_nr > ref_nr) { + pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", + dev->name, adj_dev->name, ref_nr, + adj->ref_nr - ref_nr); + adj->ref_nr -= ref_nr; + return; + } + + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); + + list_del_rcu(&adj->list); + pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + netdev_put(adj_dev, &adj->dev_tracker); + kfree_rcu(adj, rcu); +} + +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) +{ + int ret; + + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, + private, master); + if (ret) + return ret; + + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, + private, false); + if (ret) { + __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); + return ret; + } + + return 0; +} + +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + u16 ref_nr, + struct list_head *up_list, + struct list_head *down_list) +{ + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); +} + +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) +{ + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); +} + +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); +} + +static int __netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, bool master, + void *upper_priv, void *upper_info, + struct netdev_nested_priv *priv, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = upper_info, + }; + struct net_device *master_dev; + int ret = 0; ASSERT_RTNL(); @@ -4500,37 +8835,58 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. 
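+	 * i.e. linking dev under upper_dev must not create a cycle in the
+	 * adjacency graph.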
*/ - if (__netdev_search_upper_dev(upper_dev, dev)) + if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (__netdev_find_upper(dev, upper_dev)) - return -EEXIST; + if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) + return -EMLINK; - if (master && netdev_master_upper_dev_get(dev)) - return -EBUSY; + if (!master) { + if (__netdev_has_upper_dev(dev, upper_dev)) + return -EEXIST; + } else { + master_dev = __netdev_master_upper_dev_get(dev); + if (master_dev) + return master_dev == upper_dev ? -EEXIST : -EBUSY; + } - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + return ret; - upper->dev = upper_dev; - upper->master = master; - INIT_LIST_HEAD(&upper->search_list); + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, + master); + if (ret) + return ret; + + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + goto rollback; + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, priv); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + priv); - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; + +rollback: + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. @@ -4538,9 +8894,16 @@ static int __netdev_upper_dev_link(struct net_device *dev, * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false); + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4548,6 +8911,9 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device + * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -4556,12 +8922,52 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * counts are adjusted and the function returns zero. 
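+ *
+ * Hypothetical usage sketch (editorial illustration; port_dev, bond_dev and
+ * slave_priv are made-up names for what a bonding-style master might pass):
+ *
+ *	err = netdev_master_upper_dev_link(port_dev, bond_dev,
+ *					   slave_priv, NULL, extack);
+ *	if (err)
+ *		goto unwind;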
*/ int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, true); + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); +static void __netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev, + struct netdev_nested_priv *priv) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; + + ASSERT_RTNL(); + + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, + &changeupper_info.info); + + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, + &changeupper_info.info); + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, priv); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + priv); +} + /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -4573,141 +8979,647 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_TODO, + .data = NULL, + }; + + __netdev_upper_dev_unlink(dev, upper_dev, &priv); +} +EXPORT_SYMBOL(netdev_upper_dev_unlink); + +static void __netdev_adjacent_dev_set(struct net_device *upper_dev, + struct net_device *lower_dev, + bool val) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); + if (adj) + adj->ignore = val; + + adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); + if (adj) + adj->ignore = val; +} + +static void netdev_adjacent_dev_disable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, true); +} + +static void netdev_adjacent_dev_enable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, false); +} + +int netdev_adjacent_change_prepare(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + int err; + + if (!new_dev) + return 0; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_disable(dev, old_dev); + err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, + extack); + if (err) { + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL(netdev_adjacent_change_prepare); + +void netdev_adjacent_change_commit(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + if (!new_dev || !old_dev) + return; + + if (new_dev == old_dev) + return; + + netdev_adjacent_dev_enable(dev, old_dev); + __netdev_upper_dev_unlink(old_dev, dev, &priv); +} 
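+/* Hypothetical calling sequence (editorial illustration): a caller replacing
+ * the lower device old_dev of dev with new_dev is expected to do
+ *
+ *	err = netdev_adjacent_change_prepare(old_dev, new_dev, dev, extack);
+ *	if (err)
+ *		return err;
+ *	(switch the datapath over)
+ *	netdev_adjacent_change_commit(old_dev, new_dev, dev);
+ *
+ * and to call netdev_adjacent_change_abort() instead of the commit if the
+ * switch-over fails.
+ */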
+EXPORT_SYMBOL(netdev_adjacent_change_commit); + +void netdev_adjacent_change_abort(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + + if (!new_dev) + return; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + + __netdev_upper_dev_unlink(new_dev, dev, &priv); +} +EXPORT_SYMBOL(netdev_adjacent_change_abort); + +/** + * netdev_bonding_info_change - Dispatch event about slave change + * @dev: device + * @bonding_info: info to dispatch + * + * Send NETDEV_BONDING_INFO to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info) +{ + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; + + memcpy(&info.bonding_info, bonding_info, + sizeof(struct netdev_bonding_info)); + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, + &info.info); +} +EXPORT_SYMBOL(netdev_bonding_info_change); + +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} 
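+/* Hypothetical usage sketch (editorial illustration), with the RTNL lock held
+ * by the caller as ASSERT_RTNL() requires:
+ *
+ *	struct rtnl_hw_stats64 stats;
+ *	bool used;
+ *
+ *	err = netdev_offload_xstats_enable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ *					   extack);
+ *	...
+ *	err = netdev_offload_xstats_get(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ *					&stats, &used, extack);
+ */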
+EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. 
+ */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + + netdev_hw_stats64_add(stats, p_stats); } -EXPORT_SYMBOL(netdev_upper_dev_unlink); +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @dev: device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + +static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. 
+ */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + +static void netdev_adjacent_add_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.lower); + } +} + +static void netdev_adjacent_del_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.lower); + } +} + +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } +} + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + + +/** + * netdev_lower_state_changed - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. 
+ */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; - if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) + if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; + unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { + promiscuity = dev->promiscuity + inc; + if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; - pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", - dev->name); + if (unlikely(inc > 0)) { + netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_PROMISC; + } else { + flags = old_flags | IFF_PROMISC; } - if (dev->flags != old_flags) { - pr_info("device %s %s promiscuous mode\n", - dev->name, - dev->flags & IFF_PROMISC ? "entered" : "left"); + WRITE_ONCE(dev->promiscuity, promiscuity); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); + netdev_info(dev, "%s promiscuous mode\n", + dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); - audit_log(current->audit_context, GFP_ATOMIC, - AUDIT_ANOM_PROMISCUOUS, - "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", - dev->name, (dev->flags & IFF_PROMISC), - (old_flags & IFF_PROMISC), - from_kuid(&init_user_ns, audit_get_loginuid(current)), - from_kuid(&init_user_ns, uid), - from_kgid(&init_user_ns, gid), - audit_get_sessionid(current)); + audit_log(audit_context(), GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } + if (notify) { + /* The ops lock is only required to ensure consistent locking + * for `NETDEV_CHANGE` notifiers. This function is sometimes + * called without the lock, even for devices that are ops + * locked, such as in `dev_uc_sync_multiple` when using + * bonding or teaming. + */ + netdev_ops_assert_locked(dev); + __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); + } return 0; } -/** - * dev_set_promiscuity - update promiscuity count on a device - * @dev: device - * @inc: modifier - * - * Add or remove promiscuity from a device. While the count in the device - * remains above zero the interface remains promiscuous. Once it hits zero - * the device reverts back to normal filtering operation. 
A negative inc - * value is used to drop promiscuity on the device. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_promiscuity(struct net_device *dev, int inc) +int netif_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; - err = __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } -EXPORT_SYMBOL(dev_set_promiscuity); - -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_allmulti(struct net_device *dev, int inc) +int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { - unsigned int old_flags = dev->flags; + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + unsigned int allmulti, flags; ASSERT_RTNL(); - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { + allmulti = dev->allmulti + inc; + if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; - pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", - dev->name); + if (unlikely(inc > 0)) { + netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_ALLMULTI; + } else { + flags = old_flags | IFF_ALLMULTI; } - if (dev->flags ^ old_flags) { + WRITE_ONCE(dev->allmulti, allmulti); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); + netdev_info(dev, "%s allmulticast mode\n", + dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags, 0, NULL); } return 0; } -EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and @@ -4731,10 +9643,10 @@ void __dev_set_rx_mode(struct net_device *dev) * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); + __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); + __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } @@ -4751,21 +9663,21 @@ void dev_set_rx_mode(struct net_device *dev) } /** - * dev_get_flags - get flags reported to userspace - * @dev: device + * netif_get_flags() - get flags reported to userspace + * @dev: device * - * Get the combination of flag bits exported through APIs to userspace. + * Get the combination of flag bits exported through APIs to userspace. 
*/ -unsigned int dev_get_flags(const struct net_device *dev) +unsigned int netif_get_flags(const struct net_device *dev) { unsigned int flags; - flags = (dev->flags & ~(IFF_PROMISC | + flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | + (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { @@ -4779,9 +9691,10 @@ unsigned int dev_get_flags(const struct net_device *dev) return flags; } -EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(netif_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -4814,38 +9727,47 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ - ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - - if (!ret) - dev_set_rx_mode(dev); + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI - is important. Some (broken) drivers set IFF_PROMISC, when - IFF_ALLMULTI is requested not asking us and not reporting. + * is important. Some (broken) drivers set IFF_PROMISC, when + * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); + netif_set_allmulti(dev, inc, false); } return ret; } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); + if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); @@ -4855,242 +9777,1081 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. 
- */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int netif_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; - unsigned int changes, old_flags = dev->flags; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; - changes = old_flags ^ dev->flags; - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - __dev_notify_flags(dev, old_flags); + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); + __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } -EXPORT_SYMBOL(dev_change_flags); + +int __netif_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_mtu) + return ops->ndo_change_mtu(dev, new_mtu); + + /* Pairs with all the lockless reads of dev->mtu in the stack */ + WRITE_ONCE(dev->mtu, new_mtu); + return 0; +} +EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL"); + +int dev_validate_mtu(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) +{ + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); + return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); + return -EINVAL; + } + return 0; +} /** - * dev_set_mtu - Change maximum transfer unit - * @dev: device - * @new_mtu: new transfer unit + * netif_set_mtu_ext() - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * @extack: netlink extended ack * - * Change the maximum transfer size of the network device. + * Change the maximum transfer size of the network device. + * + * Return: 0 on success, -errno on failure. */ -int dev_set_mtu(struct net_device *dev, int new_mtu) +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; + int err, orig_mtu; + + netdev_ops_assert_locked(dev); if (new_mtu == dev->mtu) return 0; - /* MTU must be positive. */ - if (new_mtu < 0) - return -EINVAL; + err = dev_validate_mtu(dev, new_mtu, extack); + if (err) + return err; if (!netif_device_present(dev)) return -ENODEV; - err = 0; - if (ops->ndo_change_mtu) - err = ops->ndo_change_mtu(dev, new_mtu); - else - dev->mtu = new_mtu; + err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + orig_mtu = dev->mtu; + err = __netif_set_mtu(dev, new_mtu); + + if (!err) { + err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + orig_mtu); + err = notifier_to_errno(err); + if (err) { + /* setting mtu back and notifying everyone again, + * so that they have a chance to revert changes. 
+ */ + __netif_set_mtu(dev, orig_mtu); + call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + new_mtu); + } + } return err; } -EXPORT_SYMBOL(dev_set_mtu); -/** - * dev_set_group - Change group this device belongs to - * @dev: device - * @new_group: group this device should belong to - */ -void dev_set_group(struct net_device *dev, int new_group) +int netif_set_mtu(struct net_device *dev, int new_mtu) +{ + struct netlink_ext_ack extack; + int err; + + memset(&extack, 0, sizeof(extack)); + err = netif_set_mtu_ext(dev, new_mtu, &extack); + if (err && extack._msg) + net_err_ratelimited("%s: %s\n", dev->name, extack._msg); + return err; +} +EXPORT_SYMBOL(netif_set_mtu); + +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; + + if (new_len != orig_len) { + WRITE_ONCE(dev->tx_queue_len, new_len); + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) + goto err_rollback; + res = dev_qdisc_change_tx_queue_len(dev); + if (res) + goto err_rollback; + } + + return 0; + +err_rollback: + netdev_err(dev, "refused to change device tx_queue_len\n"); + WRITE_ONCE(dev->tx_queue_len, orig_len); + return res; +} + +void netif_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } -EXPORT_SYMBOL(dev_set_group); /** - * dev_set_mac_address - Change Media Access Control Address - * @dev: device - * @sa: new address + * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack * - * Change the hardware (MAC) address of the device + * Return: 0 on success, -errno on failure. */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL"); + +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; - if (sa->sa_family != dev->type) + if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = ops->ndo_set_mac_address(dev, sa); + err = netif_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; + if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, ss); + if (err) + return err; + } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } -EXPORT_SYMBOL(dev_set_mac_address); + +DECLARE_RWSEM(dev_addr_sem); + +/* "sa" is a true struct sockaddr with limited "sa_data" member. 
*/ +int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +{ + size_t size = sizeof(sa->sa_data); + struct net_device *dev; + int ret = 0; + + down_read(&dev_addr_sem); + rcu_read_lock(); + + dev = dev_get_by_name_rcu(net, dev_name); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + if (!dev->addr_len) + memset(sa->sa_data, 0, size); + else + memcpy(sa->sa_data, dev->dev_addr, + min_t(size_t, size, dev->addr_len)); + sa->sa_family = dev->type; + +unlock: + rcu_read_unlock(); + up_read(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL"); + +int netif_change_carrier(struct net_device *dev, bool new_carrier) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_carrier) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_carrier(dev, new_carrier); +} /** - * dev_change_carrier - Change device carrier + * dev_get_phys_port_id - Get device physical port ID * @dev: device - * @new_carrier: new value + * @ppid: port ID * - * Change device carrier + * Get device physical port ID */ -int dev_change_carrier(struct net_device *dev, bool new_carrier) +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; - if (!ops->ndo_change_carrier) + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} + +/** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * @len: limit of bytes to copy to name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (ops->ndo_get_phys_port_name) { + err = ops->ndo_get_phys_port_name(dev, name, len); + if (err != -EOPNOTSUPP) + return err; + } + return devlink_compat_phys_port_name_get(dev, name, len); +} + +/** + * netif_get_port_parent_id() - Get the device's port parent identifier + * @dev: network device + * @ppid: pointer to a storage for the port's parent identifier + * @recurse: allow/disallow recursion to lower devices + * + * Get the devices's port parent identifier. + * + * Return: 0 on success, -errno on failure. 
+ */ +int netif_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid, bool recurse) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct netdev_phys_item_id first = { }; + struct net_device *lower_dev; + struct list_head *iter; + int err; + + if (ops->ndo_get_port_parent_id) { + err = ops->ndo_get_port_parent_id(dev, ppid); + if (err != -EOPNOTSUPP) + return err; + } + + err = devlink_compat_switch_id_get(dev, ppid); + if (!recurse || err != -EOPNOTSUPP) + return err; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = netif_get_port_parent_id(lower_dev, ppid, true); + if (err) + break; + if (!first.id_len) + first = *ppid; + else if (memcmp(&first, ppid, sizeof(*ppid))) + return -EOPNOTSUPP; + } + + return err; +} +EXPORT_SYMBOL(netif_get_port_parent_id); + +/** + * netdev_port_same_parent_id - Indicate if two network devices have + * the same port parent identifier + * @a: first network device + * @b: second network device + */ +bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) +{ + struct netdev_phys_item_id a_id = { }; + struct netdev_phys_item_id b_id = { }; + + if (netif_get_port_parent_id(a, &a_id, true) || + netif_get_port_parent_id(b, &b_id, true)) + return false; + + return netdev_phys_item_id_same(&a_id, &b_id); +} +EXPORT_SYMBOL(netdev_port_same_parent_id); + +int netif_change_proto_down(struct net_device *dev, bool proto_down) +{ + if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; - return ops->ndo_change_carrier(dev, new_carrier); + if (proto_down) + netif_carrier_off(dev); + else + netif_carrier_on(dev); + WRITE_ONCE(dev->proto_down, proto_down); + return 0; } -EXPORT_SYMBOL(dev_change_carrier); /** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace + * netdev_change_proto_down_reason_locked - proto down reason * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * @dev: device + * @mask: proto down mask + * @value: proto down value */ -static int dev_new_index(struct net *net) +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value) { - int ifindex = net->ifindex; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return net->ifindex = ifindex; + u32 proto_down_reason; + int b; + + if (!mask) { + proto_down_reason = value; + } else { + proto_down_reason = dev->proto_down_reason; + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + proto_down_reason |= BIT(b); + else + proto_down_reason &= ~BIT(b); + } } + WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } -/* Delayed registration/unregisteration */ -static LIST_HEAD(net_todo_list); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; -static void net_set_todo(struct net_device *dev) +static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { - list_add_tail(&dev->todo_list, &net_todo_list); + if (flags & XDP_FLAGS_HW_MODE) + return XDP_MODE_HW; + if (flags & XDP_FLAGS_DRV_MODE) + return XDP_MODE_DRV; + if (flags & XDP_FLAGS_SKB_MODE) + return XDP_MODE_SKB; + return dev->netdev_ops->ndo_bpf ? 
XDP_MODE_DRV : XDP_MODE_SKB; } -static void rollback_registered_many(struct list_head *head) +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { - struct net_device *dev, *tmp; + switch (mode) { + case XDP_MODE_SKB: + return generic_xdp_install; + case XDP_MODE_DRV: + case XDP_MODE_HW: + return dev->netdev_ops->ndo_bpf; + default: + return NULL; + } +} + +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + +static struct bpf_prog *dev_xdp_prog(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; + return dev->xdp_state[mode].prog; +} + +u8 dev_xdp_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog || dev->xdp_state[i].link) + count++; + return count; +} +EXPORT_SYMBOL_GPL(dev_xdp_prog_count); + +u8 dev_xdp_sb_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog && + !dev->xdp_state[i].prog->aux->xdp_has_frags) + count++; + return count; +} + +int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + if (!dev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + bpf->command == XDP_SETUP_PROG && + bpf->prog && !bpf->prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(bpf->extack, + "unable to propagate XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); + return -EBUSY; + } + + return dev->netdev_ops->ndo_bpf(dev, bpf); +} +EXPORT_SYMBOL_GPL(netif_xdp_propagate); + +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) +{ + struct bpf_prog *prog = dev_xdp_prog(dev, mode); + + return prog ? prog->aux->id : 0; +} + +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + +static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_prog *prog) +{ + dev->xdp_state[mode].link = NULL; + dev->xdp_state[mode].prog = prog; +} + +static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, + bpf_op_t bpf_op, struct netlink_ext_ack *extack, + u32 flags, struct bpf_prog *prog) +{ + struct netdev_bpf xdp; + int err; + + netdev_ops_assert_locked(dev); + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + prog && !prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); + return -EBUSY; + } + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; + xdp.extack = extack; + xdp.flags = flags; + xdp.prog = prog; + + /* Drivers assume refcnt is already incremented (i.e, prog pointer is + * "moved" into driver), so they don't increment it on their own, but + * they do decrement refcnt when program is detached or replaced. + * Given net_device also owns link/prog, we need to bump refcnt here + * to prevent drivers from underflowing it. 
+ */ + if (prog) + bpf_prog_inc(prog); + err = bpf_op(dev, &xdp); + if (err) { + if (prog) + bpf_prog_put(prog); + return err; + } + + if (mode != XDP_MODE_HW) + bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); + + return 0; +} + +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct bpf_xdp_link *link; + struct bpf_prog *prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - BUG_ON(dev_boot_phase); ASSERT_RTNL(); - list_for_each_entry_safe(dev, tmp, head, unreg_list) { - /* Some devices call without registering - * for initialization unwind. Remove those - * devices and proceed with the remaining. - */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - pr_debug("unregister_netdevice: device %s/%p never was registered\n", - dev->name, dev); + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { + prog = dev_xdp_prog(dev, mode); + if (!prog) + continue; - WARN_ON(1); - list_del(&dev->unreg_list); + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) continue; + + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); + } +} + +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) +{ + unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); + struct bpf_prog *cur_prog; + struct net_device *upper; + struct list_head *iter; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err; + + ASSERT_RTNL(); + + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } + /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ + if (num_modes > 1) { + NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); + return -EINVAL; + } + /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ + if (!num_modes && dev_xdp_prog_count(dev) > 1) { + NL_SET_ERR_MSG(extack, + "More than one program loaded, unset mode is ambiguous"); + return -EINVAL; + } + /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ + if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { + NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); + return -EINVAL; + } + + mode = dev_xdp_mode(dev, flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + + /* don't allow if an upper device already has a program */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (dev_xdp_prog_count(upper) > 0) { + NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); + return -EEXIST; } - dev->dismantle = true; - BUG_ON(dev->reg_state != NETREG_REGISTERED); } - /* If device is running, close it first. 
*/ - dev_close_many(head); + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } + if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } - list_for_each_entry(dev, head, unreg_list) { - /* And unlink it from device chain. */ - unlist_netdevice(dev); + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; - dev->reg_state = NETREG_UNREGISTERING; + if (new_prog) { + bool offload = mode == XDP_MODE_HW; + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? XDP_MODE_DRV : XDP_MODE_SKB; + + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); + return -EBUSY; + } + if (!offload && dev_xdp_prog(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); + return -EEXIST; + } + if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { + NL_SET_ERR_MSG(extack, "Program bound to different device"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); + return -EINVAL; + } } - synchronize_net(); + /* don't call drivers if the effective program didn't change */ + if (new_prog != cur_prog) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { + NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); + return -EOPNOTSUPP; + } - list_for_each_entry(dev, head, unreg_list) { - /* Shutdown queueing discipline. */ - dev_shutdown(dev); + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); + if (err) + return err; + } + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); + if (cur_prog) + bpf_prog_put(cur_prog); - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + return 0; +} - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} - /* - * Flush the unicast and multicast chains - */ - dev_uc_flush(dev); - dev_mc_flush(dev); +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); + ASSERT_RTNL(); - /* Notifier chain MUST detach us all upper devices. 
*/ - WARN_ON(netdev_has_any_upper_dev(dev)); + mode = dev_xdp_mode(dev, link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); -#ifdef CONFIG_XPS - /* Remove XPS queueing entries */ - netif_reset_xps_queues_gt(dev, 0); + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) { + netdev_lock_ops(xdp_link->dev); + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + netdev_unlock_ops(xdp_link->dev); + xdp_link->dev = NULL; + } + + rtnl_unlock(); +} + +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + +static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err = 0; + + rtnl_lock(); + + /* link might have been auto-released already, so fail */ + if (!xdp_link->dev) { + err = -ENOLINK; + goto out_unlock; + } + + if (old_prog && link->prog != old_prog) { + err = -EPERM; + goto out_unlock; + } + old_prog = link->prog; + if (old_prog->type != new_prog->type || + old_prog->expected_attach_type != new_prog->expected_attach_type) { + err = -EINVAL; + goto out_unlock; + } + + if (old_prog == new_prog) { + /* no-op, don't disturb drivers */ + bpf_prog_put(new_prog); + goto out_unlock; + } + + netdev_lock_ops(xdp_link->dev); + mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); + bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); + err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, + xdp_link->flags, new_prog); + netdev_unlock_ops(xdp_link->dev); + if (err) + goto out_unlock; + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + rtnl_unlock(); + return err; +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, + .update_prog = bpf_xdp_link_update, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct 
bpf_link_primer link_primer; + struct netlink_ext_ack extack = {}; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + rtnl_lock(); + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) { + rtnl_unlock(); + return -EINVAL; + } + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto unlock; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog, + attr->link_create.attach_type); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto unlock; + } + + netdev_lock_ops(dev); + err = dev_xdp_attach_link(dev, &extack, link); + netdev_unlock_ops(dev); + rtnl_unlock(); + + if (err) { + link->dev = NULL; + bpf_link_cleanup(&link_primer); + trace_bpf_xdp_link_attach_failed(extack._msg); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +unlock: + rtnl_unlock(); + +out_put_dev: + dev_put(dev); + return err; +} + +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @extack: netlink extended ack + * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear + * @flags: xdp-related flags + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags) +{ + enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); + struct bpf_prog *new_prog = NULL, *old_prog = NULL; + int err; + + ASSERT_RTNL(); + + if (fd >= 0) { + new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + } + + if (expected_fd >= 0) { + old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(old_prog)) { + err = PTR_ERR(old_prog); + old_prog = NULL; + goto err_out; + } + } + + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); + +err_out: + if (err && new_prog) + bpf_prog_put(new_prog); + if (old_prog) + bpf_prog_put(old_prog); + return err; +} + +u32 dev_get_min_mp_channel_count(const struct net_device *dev) +{ + int i; + + netdev_ops_assert_locked(dev); + + for (i = dev->real_num_rx_queues - 1; i >= 0; i--) + if (dev->_rx[i].mp_params.mp_priv) + /* The channel count is the idx plus 1. */ + return i + 1; + + return 0; +} + +/** + * dev_index_reserve() - allocate an ifindex in a namespace + * @net: the applicable net namespace + * @ifindex: requested ifindex, pass %0 to get one allocated + * + * Allocate a ifindex for a new device. Caller must either use the ifindex + * to store the device (via list_netdevice()) or call dev_index_release() + * to give the index up. + * + * Return: a suitable unique value for a new device interface number or -errno. 
+ */ +static int dev_index_reserve(struct net *net, u32 ifindex) +{ + int err; + + if (ifindex > INT_MAX) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EINVAL; + } + + if (!ifindex) + err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, + xa_limit_31b, &net->ifindex, GFP_KERNEL); + else + err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); + if (err < 0) + return err; + + return ifindex; +} + +static void dev_index_release(struct net *net, int ifindex) +{ + /* Expect only unused indexes, unlist_netdevice() removes the used */ + WARN_ON(xa_erase(&net->dev_by_index, ifindex)); +} + +static bool from_cleanup_net(void) +{ +#ifdef CONFIG_NET_NS + return current == READ_ONCE(cleanup_net_task); +#else + return false; #endif +} + +/* Delayed registration/unregisteration */ +LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +atomic_t dev_unreg_count = ATOMIC_INIT(0); + +static void net_set_todo(struct net_device *dev) +{ + list_add_tail(&dev->todo_list, &net_todo_list); +} + +static netdev_features_t netdev_sync_upper_features(struct net_device *lower, + struct net_device *upper, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(upper->wanted_features & feature) + && (features & feature)) { + netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", + &feature, upper->name); + features &= ~feature; + } } - synchronize_net(); + return features; +} - list_for_each_entry(dev, head, unreg_list) - dev_put(dev); +static void netdev_sync_lower_features(struct net_device *upper, + struct net_device *lower, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(features & feature) && (lower->features & feature)) { + netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", + &feature, lower->name); + netdev_lock_ops(lower); + lower->wanted_features &= ~feature; + __netdev_update_features(lower); + + if (unlikely(lower->features & feature)) + netdev_WARN(upper, "failed to disable %pNF on %s!\n", + &feature, lower->name); + else + netdev_features_change(lower); + netdev_unlock_ops(lower); + } + } } -static void rollback_registered(struct net_device *dev) +static bool netdev_has_ip_or_hw_csum(netdev_features_t features) { - LIST_HEAD(single); + netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; + bool hw_csum = features & NETIF_F_HW_CSUM; - list_add(&dev->unreg_list, &single); - rollback_registered_many(&single); - list_del(&single); + return ip_csum || hw_csum; } static netdev_features_t netdev_fix_features(struct net_device *dev, @@ -5122,6 +10883,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_TSO6; } + /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ + if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) + features &= ~NETIF_F_TSO_MANGLEID; + /* TSO ECN requires that TSO is present as well. 
*/ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; @@ -5132,33 +10897,71 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_GSO; } - /* UFO needs SG and checksumming */ - if (features & NETIF_F_UFO) { - /* maybe split UFO into V4 and V6? */ - if (!((features & NETIF_F_GEN_CSUM) || - (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no checksum offload features.\n"); - features &= ~NETIF_F_UFO; + /* GSO partial features require GSO partial be set */ + if ((features & dev->gso_partial_features) && + !(features & NETIF_F_GSO_PARTIAL)) { + netdev_dbg(dev, + "Dropping partially supported GSO features since no GSO partial.\n"); + features &= ~dev->gso_partial_features; + } + + if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. + */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + /* LRO/HW-GRO features cannot be combined with RX-FCS */ + if (features & NETIF_F_RXFCS) { + if (features & NETIF_F_LRO) { + netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_LRO; } - if (!(features & NETIF_F_SG)) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); - features &= ~NETIF_F_UFO; + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_GRO_HW; } } + if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { + netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); + features &= ~NETIF_F_LRO; + } + + if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + + if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); + features &= ~NETIF_F_GSO_UDP_L4; + } + return features; } int __netdev_update_features(struct net_device *dev) { + struct net_device *upper, *lower; netdev_features_t features; - int err = 0; + struct list_head *iter; + int err = -1; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); features = netdev_get_wanted_features(dev); @@ -5168,26 +10971,81 @@ int __netdev_update_features(struct net_device *dev) /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); + /* some features can't be enabled if they're off on an upper device */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) + features = netdev_sync_upper_features(dev, upper, features); + if (dev->features == features) - return 0; + goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", 
err, &features, &dev->features); + /* return non-0 since some features might have changed and + * it's better to fire a spurious notification than miss it + */ return -1; } - if (!err) +sync_lower: + /* some features must be disabled on lower devices when disabled + * on an upper device (think: bonding master or bridge) + */ + netdev_for_each_lower_dev(dev, lower, iter) + netdev_sync_lower_features(dev, lower, features); + + if (!err) { + netdev_features_t diff = features ^ dev->features; + + if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { + /* udp_tunnel_{get,drop}_rx_info both need + * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the + * device, or they won't do anything. + * Thus we need to update dev->features + * *before* calling udp_tunnel_get_rx_info, + * but *after* calling udp_tunnel_drop_rx_info. + */ + udp_tunnel_nic_lock(dev); + if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { + dev->features = features; + udp_tunnel_get_rx_info(dev); + } else { + udp_tunnel_drop_rx_info(dev); + } + udp_tunnel_nic_unlock(dev); + } + + if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { + if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_ctag_filter_info(dev); + } else { + vlan_drop_rx_ctag_filter_info(dev); + } + } + + if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { + if (features & NETIF_F_HW_VLAN_STAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_stag_filter_info(dev); + } else { + vlan_drop_rx_stag_filter_info(dev); + } + } + dev->features = features; + } - return 1; + return err < 0 ? 0 : 1; } /** @@ -5239,35 +11097,65 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); - if (netif_carrier_ok(rootdev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); - } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); - } + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + + if (netif_carrier_ok(rootdev)) + netif_carrier_on(dev); + else + netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; + size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kvfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); + + kvfree(dev->_rx); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -5285,10 +11173,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netif_free_tx_queues(struct net_device *dev) { - if (is_vmalloc_addr(dev->_tx)) - vfree(dev->_tx); - else - kfree(dev->_tx); + kvfree(dev->_tx); } static int 
netif_alloc_netdev_queues(struct net_device *dev) @@ -5297,14 +11182,13 @@ static int netif_alloc_netdev_queues(struct net_device *dev) struct netdev_queue *tx; size_t sz = count * sizeof(*tx); - BUG_ON(count < 1 || count > 0xffff); + if (count < 1 || count > 0xffff) + return -EINVAL; + + tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!tx) + return -ENOMEM; - tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!tx) { - tx = vzalloc(sz); - if (!tx) - return -ENOMEM; - } dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -5313,28 +11197,93 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return 0; } +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + +static int netdev_do_alloc_pcpu_stats(struct net_device *dev) +{ + void __percpu *v; + + /* Drivers implementing ndo_get_peer_dev must support tstat + * accounting, so that skb_do_redirect() can bump the dev's + * RX stats upon network namespace switch. + */ + if (dev->netdev_ops->ndo_get_peer_dev && + dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) + return -EOPNOTSUPP; + + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return 0; + case NETDEV_PCPU_STAT_LSTATS: + v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); + break; + default: + return -EINVAL; + } + + return v ? 0 : -ENOMEM; +} + +static void netdev_do_free_pcpu_stats(struct net_device *dev) +{ + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return; + case NETDEV_PCPU_STAT_LSTATS: + free_percpu(dev->lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + free_percpu(dev->tstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + free_percpu(dev->dstats); + break; + } +} + +static void netdev_free_phy_link_topology(struct net_device *dev) +{ + struct phy_link_topology *topo = dev->link_topo; + + if (IS_ENABLED(CONFIG_PHYLIB) && topo) { + xa_destroy(&topo->phys); + kfree(topo); + dev->link_topo = NULL; + } +} + /** - * register_netdevice - register a network device - * @dev: device to register + * register_netdevice() - register a network device + * @dev: device to register * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. - * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. 
*/ - int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); + BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < + NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -5344,22 +11293,33 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); + ret = ethtool_check_ops(dev->ethtool_ops); + if (ret) + return ret; + + /* rss ctx ID 0 is reserved for the default context, start from 1 */ + xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); + mutex_init(&dev->ethtool->rss_lock); + spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - dev->iflink = -1; - ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; + ret = -ENOMEM; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; - goto out; + goto err_free_name; } } @@ -5372,30 +11332,44 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - ret = -EBUSY; - if (!dev->ifindex) - dev->ifindex = dev_new_index(net); - else if (__dev_get_by_index(net, dev->ifindex)) + ret = netdev_do_alloc_pcpu_stats(dev); + if (ret) goto err_uninit; - if (dev->iflink == -1) - dev->iflink = dev->ifindex; + ret = dev_index_reserve(net, dev->ifindex); + if (ret < 0) + goto err_free_pcpu; + dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ - dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->udp_tunnel_nic_info) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + dev->wanted_features = dev->features & dev->hw_features; - /* Turn on no cache copy if HW is doing checksum */ - if (!(dev->flags & IFF_LOOPBACK)) { + if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; - if (dev->features & NETIF_F_ALL_CSUM) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; - } - } + + /* If IPv4 TCP segmentation offload is supported we should also + * allow the device to enable segmenting the frame with the option + * of ignoring a static IP ID value. This doesn't enable the + * feature itself but allows the user to enable it later. + */ + if (dev->hw_features & NETIF_F_TSO) + dev->hw_features |= NETIF_F_TSO_MANGLEID; + if (dev->vlan_features & NETIF_F_TSO) + dev->vlan_features |= NETIF_F_TSO_MANGLEID; + if (dev->mpls_features & NETIF_F_TSO) + dev->mpls_features |= NETIF_F_TSO_MANGLEID; + if (dev->hw_enc_features & NETIF_F_TSO) + dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ @@ -5403,7 +11377,7 @@ int register_netdevice(struct net_device *dev) /* Make NETIF_F_SG inheritable to tunnel devices. */ - dev->hw_enc_features |= NETIF_F_SG; + dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ @@ -5412,14 +11386,20 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_uninit; + goto err_ifindex_release; ret = netdev_register_kobject(dev); + + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, ret ? 
NETREG_UNREGISTERED : NETREG_REGISTERED); + netdev_unlock(dev); + if (ret) - goto err_uninit; - dev->reg_state = NETREG_REGISTERED; + goto err_uninit_notify; + netdev_lock_ops(dev); __netdev_update_features(dev); + netdev_unlock_ops(dev); /* * Default initial state at registry is that the @@ -5431,8 +11411,10 @@ int register_netdevice(struct net_device *dev) linkwatch_init_dev(dev); dev_init_scheduler(dev); - dev_hold(dev); + + netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should @@ -5443,57 +11425,54 @@ int register_netdevice(struct net_device *dev) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ + netdev_lock_ops(dev); ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); ret = notifier_to_errno(ret); if (ret) { - rollback_registered(dev); - dev->reg_state = NETREG_UNREGISTERED; + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; + unregister_netdevice_queue(dev, NULL); + goto out; } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; +err_uninit_notify: + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); +err_ifindex_release: + dev_index_release(net, dev->ifindex); +err_free_pcpu: + netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + if (dev->priv_destructor) + dev->priv_destructor(dev); +err_free_name: + netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * The setup steps dummy netdevs need which normal netdevs get by going + * through register_netdevice(). */ -int init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; - /* NAPI wants this */ - INIT_LIST_HEAD(&dev->napi_list); - /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); @@ -5502,11 +11481,7 @@ int init_dummy_netdev(struct net_device *dev) * because users of this 'device' dont need to change * its refcount. 
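/*
 * Illustrative sketch (not part of the diff above): the error labels in
 * register_netdevice() (err_uninit_notify -> err_ifindex_release ->
 * err_free_pcpu -> err_uninit -> err_free_name) undo the setup steps in
 * strict reverse order, so a failure at any point releases exactly what was
 * already acquired.  The same idiom in miniature, with hypothetical
 * resources:
 */
#include <linux/slab.h>
#include <linux/errno.h>

struct example_priv {
	int *first;
	int *second;
};

static int example_init(struct example_priv *p)
{
	p->first = kzalloc(sizeof(*p->first), GFP_KERNEL);
	if (!p->first)
		return -ENOMEM;

	p->second = kzalloc(sizeof(*p->second), GFP_KERNEL);
	if (!p->second)
		goto err_free_first;

	return 0;

err_free_first:		/* unwind in reverse order of acquisition */
	kfree(p->first);
	p->first = NULL;
	return -ENOMEM;
}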
*/ - - return 0; } -EXPORT_SYMBOL_GPL(init_dummy_netdev); - /** * register_netdev - register a network device @@ -5523,28 +11498,41 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev); */ int register_netdev(struct net_device *dev) { + struct net *net = dev_net(dev); int err; - rtnl_lock(); + if (rtnl_net_lock_killable(net)) + return -EINTR; + err = register_netdevice(dev); - rtnl_unlock(); + + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif } EXPORT_SYMBOL(netdev_refcnt_read); +int netdev_unregister_timeout_secs __read_mostly = 10; + +#define WAIT_REFS_MIN_MSECS 1 +#define WAIT_REFS_MAX_MSECS 250 /** - * netdev_wait_allrefs - wait until all references are gone. - * @dev: target net_device + * netdev_wait_allrefs_any - wait until all references are gone. + * @list: list of net_devices to wait on * * This is called when unregistering network devices. * @@ -5554,51 +11542,69 @@ EXPORT_SYMBOL(netdev_refcnt_read); * We can get stuck here if buggy protocols don't correctly * call dev_put. */ -static void netdev_wait_allrefs(struct net_device *dev) +static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; - int refcnt; - - linkwatch_forget_dev(dev); + struct net_device *dev; + int wait = 0; rebroadcast_time = warning_time = jiffies; - refcnt = netdev_refcnt_read(dev); - while (refcnt != 0) { + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + list_for_each_entry(dev, list, todo_list) + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } + list_for_each_entry(dev, list, todo_list) + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + break; + } __rtnl_unlock(); rebroadcast_time = jiffies; } - msleep(250); + rcu_barrier(); - refcnt = netdev_refcnt_read(dev); + if (!wait) { + wait = WAIT_REFS_MIN_MSECS; + } else { + msleep(wait); + wait = min(wait << 1, WAIT_REFS_MAX_MSECS); + } + + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + if (time_after(jiffies, warning_time + + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { + list_for_each_entry(dev, list, todo_list) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, netdev_refcnt_read(dev)); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); + } - if (time_after(jiffies, warning_time + 10 * HZ)) { - pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", - dev->name, refcnt); warning_time = jiffies; } } @@ -5630,76 +11636,172 @@ static void netdev_wait_allrefs(struct net_device *dev) */ void netdev_run_todo(void) { + struct net_device *dev, *tmp; struct list_head list; + int cnt; +#ifdef CONFIG_LOCKDEP + struct list_head unlink_list; + + list_replace_init(&net_unlink_list, &unlink_list); + + while (!list_empty(&unlink_list)) { + dev = list_first_entry(&unlink_list, struct net_device, + unlink_list); + list_del_init(&dev->unlink_list); + dev->nested_level = dev->lower_level - 1; + } +#endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); - /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); - while (!list_empty(&list)) { - struct net_device *dev - = list_first_entry(&list, struct net_device, todo_list); - list_del(&dev->todo_list); - - rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - __rtnl_unlock(); - + list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { - pr_err("network todo '%s' but state %d\n", - dev->name, dev->reg_state); - dump_stack(); + netdev_WARN(dev, "run_todo but not unregistering\n"); + list_del(&dev->todo_list); continue; } - dev->reg_state = NETREG_UNREGISTERED; - - on_each_cpu(flush_backlog, dev, 1); + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); + netdev_unlock(dev); + linkwatch_sync_dev(dev); + } - netdev_wait_allrefs(dev); + cnt = 0; + while (!list_empty(&list)) { + dev = netdev_wait_allrefs_any(&list); + list_del(&dev->todo_list); /* paranoia */ - BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(netdev_refcnt_read(dev) != 1); + BUG_ON(!list_empty(&dev->ptype_all)); + BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); - WARN_ON(dev->dn_ptr); - if (dev->destructor) - dev->destructor(dev); + netdev_do_free_pcpu_stats(dev); + if (dev->priv_destructor) + dev->priv_destructor(dev); + if (dev->needs_free_netdev) + free_netdev(dev); + + cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } + if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) + wake_up(&netdev_unregistering_wq); +} + +/* Collate per-cpu network dstats statistics + * + * Read per-cpu network statistics from dev->dstats and populate the related + * fields in @s. + */ +static void dev_fetch_dstats(struct rtnl_link_stats64 *s, + const struct pcpu_dstats __percpu *dstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, rx_drops; + u64 tx_packets, tx_bytes, tx_drops; + const struct pcpu_dstats *stats; + unsigned int start; + + stats = per_cpu_ptr(dstats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + rx_drops = u64_stats_read(&stats->rx_drops); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + tx_drops = u64_stats_read(&stats->tx_drops); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->rx_dropped += rx_drops; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + s->tx_dropped += tx_drops; + } +} + +/* ndo_get_stats64 implementation for dtstats-based accounting. + * + * Populate @s from dev->stats and dev->dstats. 
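/*
 * Illustrative sketch (not part of the diff above): the "waiting for %s to
 * become free" splat in netdev_wait_allrefs_any() now dumps
 * dev->refcnt_tracker, so leaked references can be attributed to their
 * owners.  Code that holds a netdevice across sleeps is expected to take a
 * tracked reference via the real netdev_hold()/netdev_put() helpers; the
 * structure and functions below are hypothetical users of them.
 */
#include <linux/netdevice.h>

struct example_work {
	struct net_device *dev;
	netdevice_tracker dev_tracker;
};

static void example_grab(struct example_work *w, struct net_device *dev)
{
	w->dev = dev;
	netdev_hold(dev, &w->dev_tracker, GFP_KERNEL);
}

static void example_release(struct example_work *w)
{
	netdev_put(w->dev, &w->dev_tracker);
	w->dev = NULL;
}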
This is used internally by the + * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. + */ +static void dev_get_dstats64(const struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_dstats(s, dev->dstats); } -/* Convert net_device_stats to rtnl_link_stats64. They have the same - * fields in the same order, with only the type differing. +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { -#if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*stats64)); -#else - size_t i, n = sizeof(*stats64) / sizeof(u64); - const unsigned long *src = (const unsigned long *)netdev_stats; + size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); + const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = src[i]; -#endif + dst[i] = (unsigned long)atomic_long_read(&src[i]); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); +static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( + struct net_device *dev) +{ + struct net_device_core_stats __percpu *p; + + p = alloc_percpu_gfp(struct net_device_core_stats, + GFP_ATOMIC | __GFP_NOWARN); + + if (p && cmpxchg(&dev->core_stats, NULL, p)) + free_percpu(p); + + /* This READ_ONCE() pairs with the cmpxchg() above */ + return READ_ONCE(dev->core_stats); +} + +noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) +{ + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); + unsigned long __percpu *field; + + if (unlikely(!p)) { + p = netdev_core_stats_alloc(dev); + if (!p) + return; + } + + field = (unsigned long __percpu *)((void __percpu *)p + offset); + this_cpu_inc(*field); +} +EXPORT_SYMBOL_GPL(netdev_core_stats_inc); + /** * dev_get_stats - get network device statistics * @dev: device to get statistics from @@ -5714,20 +11816,102 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_core_stats __percpu *p; + + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. 
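/*
 * Illustrative sketch (not part of the diff above): netdev_core_stats_alloc()
 * publishes the lazily allocated per-CPU block with cmpxchg() and frees its
 * own copy if another CPU won the race, so dev->core_stats is only ever set
 * once.  A stand-alone user-space rendition of the same publish-once idiom
 * using C11 atomics (all names hypothetical):
 */
#include <stdatomic.h>
#include <stdlib.h>

static _Atomic(long *) shared_stats;

static long *get_stats_once(void)
{
	long *p = atomic_load_explicit(&shared_stats, memory_order_acquire);
	long *expected = NULL;

	if (p)
		return p;

	p = calloc(64, sizeof(*p));
	if (!p)
		return NULL;

	if (!atomic_compare_exchange_strong(&shared_stats, &expected, p)) {
		free(p);		/* somebody else published first */
		p = expected;
	}
	return p;
}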
+ */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { + dev_get_tstats64(dev, storage); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { + dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } - storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + p = READ_ONCE(dev->core_stats); + if (p) { + const struct net_device_core_stats *core_stats; + int i; + + for_each_possible_cpu(i) { + core_stats = per_cpu_ptr(p, i); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); + storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); + } + } return storage; } EXPORT_SYMBOL(dev_get_stats); +/** + * dev_fetch_sw_netstats - get per-cpu network device statistics + * @s: place to store stats + * @netstats: per-cpu network stats to read from + * + * Read per-cpu network statistics and populate the related fields in @s. + */ +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + const struct pcpu_sw_netstats *stats; + unsigned int start; + + stats = per_cpu_ptr(netstats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + } +} +EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); + +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. 
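/*
 * Illustrative sketch (not part of the diff above): dev_fetch_sw_netstats()
 * is the reader half of the pcpu_sw_netstats scheme; the driver side bumps
 * its per-CPU counters under u64_stats_update_begin()/_end().  A hypothetical
 * transmit-path helper doing so (in a real ndo_start_xmit, BHs are disabled
 * on the local CPU, which is what makes this_cpu_ptr() safe here); the
 * dev_sw_netstats_tx_add() helper wraps the same sequence.
 */
#include <linux/netdevice.h>
#include <linux/u64_stats_sync.h>

static void example_count_tx(struct net_device *dev, unsigned int len)
{
	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&tstats->syncp);
	u64_stats_inc(&tstats->tx_packets);
	u64_stats_add(&tstats->tx_bytes, len);
	u64_stats_update_end(&tstats->syncp);
}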
+ */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -5739,8 +11923,8 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); - queue->qdisc = &noop_qdisc; - queue->qdisc_sleeping = &noop_qdisc; + RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); + RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; @@ -5757,24 +11941,44 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); /** - * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default + * @dev: netdev to enable the IRQ coalescing on * - * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subquue structs - * for each queue on the device. + * Sets a conservative default for SW IRQ coalescing. Users can use + * sysfs attributes to override the default values. + */ +void netdev_sw_irq_coalesce_default_on(struct net_device *dev) +{ + WARN_ON(dev->reg_state == NETREG_REGISTERED); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + netdev_set_gro_flush_timeout(dev, 20000); + netdev_set_defer_hard_irqs(dev, 1); + } +} +EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); + +/** + * alloc_netdev_mqs - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device. 
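/*
 * Illustrative sketch (not part of the diff above): as the kerneldoc says,
 * dev_get_tstats64() can be wired straight into a driver's ops when the
 * driver allocates dev->tstats itself (rather than via pcpu_stat_type).
 * The ops structure name below is hypothetical.
 */
#include <linux/netdevice.h>

static const struct net_device_ops example_netdev_ops = {
	/* .ndo_open, .ndo_stop, .ndo_start_xmit, ... elided */
	.ndo_get_stats64 = dev_get_tstats64,
};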
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; - size_t alloc_size; - struct net_device *p; + size_t napi_config_sz; + unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -5783,32 +11987,29 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_RPS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif - alloc_size = sizeof(struct net_device); - if (sizeof_priv) { - /* ensure 32-byte alignment of private area */ - alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); - alloc_size += sizeof_priv; - } - /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN - 1; + maxqs = max(txqs, rxqs); - p = kzalloc(alloc_size, GFP_KERNEL); - if (!p) + dev = kvzalloc(struct_size(dev, priv, sizeof_priv), + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!dev) return NULL; - dev = PTR_ALIGN(p, NETDEV_ALIGN); - dev->padded = (char *)dev - (char *)p; + dev->priv_len = sizeof_priv; + ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev"); +#ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_p; + goto free_dev; + __dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); +#endif if (dev_addr_init(dev)) goto free_pcpu; @@ -5818,32 +12019,76 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; + dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; + dev->upper_level = 1; + dev->lower_level = 1; +#ifdef CONFIG_LOCKDEP + dev->nested_level = 0; + INIT_LIST_HEAD(&dev->unlink_list); +#endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); - INIT_LIST_HEAD(&dev->upper_dev_list); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->ptype_all); + INIT_LIST_HEAD(&dev->ptype_specific); + INIT_LIST_HEAD(&dev->net_notifier_list); +#ifdef CONFIG_NET_SCHED + hash_init(dev->qdisc_hash); +#endif + + mutex_init(&dev->lock); + + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); + if (!dev->tx_queue_len) { + dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + } + dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_RPS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif + dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); + if (!dev->ethtool) + goto free_all; + + dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + if (!dev->cfg) + goto free_all; + dev->cfg_pending = dev->cfg; - strcpy(dev->name, name); + dev->num_napi_configs = maxqs; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); + if (!dev->napi_config) + goto free_all; + + strscpy(dev->name, name); + 
dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_netdev_init(dev); + return dev; free_all: @@ -5851,56 +12096,92 @@ free_all: return NULL; free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); - netif_free_tx_queues(dev); -#ifdef CONFIG_RPS - kfree(dev->_rx); +free_dev: #endif - -free_p: - kfree(p); + kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); +static void netdev_napi_exit(struct net_device *dev) +{ + if (!list_empty(&dev->napi_list)) { + struct napi_struct *p, *n; + + netdev_lock(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + __netif_napi_del_locked(p); + netdev_unlock(dev); + + synchronize_net(); + } + + kvfree(dev->napi_config); +} + /** - * free_netdev - free network device - * @dev: device + * free_netdev - free network device + * @dev: device * - * This function does the last stage of destroying an allocated device - * interface. The reference to the device object is released. - * If this is the last reference then it will be freed. + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. If this + * is the last reference then it will be freed.Must be called in process + * context. */ void free_netdev(struct net_device *dev) { - struct napi_struct *p, *n; + might_sleep(); - release_net(dev_net(dev)); + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. + */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + WARN_ON(dev->cfg != dev->cfg_pending); + kfree(dev->cfg); + kfree(dev->ethtool); netif_free_tx_queues(dev); -#ifdef CONFIG_RPS - kfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); - list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) - netif_napi_del(p); + netdev_napi_exit(dev); + netif_del_cpu_rmap(dev); + + ref_tracker_dir_exit(&dev->refcnt_tracker); +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; +#endif + free_percpu(dev->core_stats); + dev->core_stats = NULL; + free_percpu(dev->xdp_bulkq); + dev->xdp_bulkq = NULL; + + netdev_free_phy_link_topology(dev); + + mutex_destroy(&dev->lock); /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); + if (dev->reg_state == NETREG_UNINITIALIZED || + dev->reg_state == NETREG_DUMMY) { + kvfree(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -5908,6 +12189,19 @@ void free_netdev(struct net_device *dev) EXPORT_SYMBOL(free_netdev); /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. 
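/*
 * Illustrative sketch (not part of the diff above): putting the allocation
 * and registration paths together, a hypothetical driver allocates with
 * alloc_netdev_mqs() (note the name_assign_type argument), registers with
 * register_netdev() - which takes the rtnl lock itself - and on teardown
 * unregisters before calling free_netdev().
 */
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

static struct net_device *example_create(void)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(0, "exmpl%d", NET_NAME_ENUM, ether_setup, 4, 4);
	if (!dev)
		return NULL;

	eth_hw_addr_random(dev);

	if (register_netdev(dev)) {
		free_netdev(dev);	/* also the correct cleanup on failure */
		return NULL;
	}
	return dev;
}

static void example_destroy(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes and drops rtnl itself */
	free_netdev(dev);
}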
@@ -5916,13 +12210,28 @@ EXPORT_SYMBOL(free_netdev); void synchronize_net(void) { might_sleep(); - if (rtnl_is_locked()) + if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); +static void netdev_rss_contexts_free(struct net_device *dev) +{ + struct ethtool_rxfh_context *ctx; + unsigned long context; + + mutex_lock(&dev->ethtool->rss_lock); + xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { + xa_erase(&dev->ethtool->rss_ctx, context); + dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); + kfree(ctx); + } + xa_destroy(&dev->ethtool->rss_ctx); + mutex_unlock(&dev->ethtool->rss_lock); +} + /** * unregister_netdevice_queue - remove device from the kernel * @dev: device @@ -5943,26 +12252,196 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) if (head) { list_move_tail(&dev->unreg_list, head); } else { - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + +/* devices must be UP and netdev_lock()'d */ +static void netif_close_many_and_unlock(struct list_head *close_head) +{ + struct net_device *dev, *tmp; + + netif_close_many(close_head, false); + + /* ... now unlock them */ + list_for_each_entry_safe(dev, tmp, close_head, close_list) { + netdev_unlock(dev); + list_del_init(&dev->close_list); + } +} + +static void netif_close_many_and_unlock_cond(struct list_head *close_head) +{ +#ifdef CONFIG_LOCKDEP + /* We can only track up to MAX_LOCK_DEPTH locks per task. + * + * Reserve half the available slots for additional locks possibly + * taken by notifiers and (soft)irqs. + */ + unsigned int limit = MAX_LOCK_DEPTH / 2; + + if (lockdep_depth(current) > limit) + netif_close_many_and_unlock(close_head); +#endif +} + +void unregister_netdevice_many_notify(struct list_head *head, + u32 portid, const struct nlmsghdr *nlh) +{ + struct net_device *dev, *tmp; + LIST_HEAD(close_head); + int cnt = 0; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + if (list_empty(head)) + return; + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. Start with ops locked... */ + list_for_each_entry(dev, head, unreg_list) { + if (!(dev->flags & IFF_UP)) + continue; + if (netdev_need_ops_lock(dev)) { + list_add_tail(&dev->close_list, &close_head); + netdev_lock(dev); + } + netif_close_many_and_unlock_cond(&close_head); + } + netif_close_many_and_unlock(&close_head); + /* ... now go over the rest. 
*/ + list_for_each_entry(dev, head, unreg_list) { + if (!netdev_need_ops_lock(dev)) + list_add_tail(&dev->close_list, &close_head); + } + netif_close_many(&close_head, true); + + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); + netdev_unlock(dev); + } + flush_all_backlogs(); + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + + /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); + dev_shutdown(dev); + dev_tcx_uninstall(dev); + dev_xdp_uninstall(dev); + dev_memory_provider_uninstall(dev); + netdev_unlock_ops(dev); + bpf_dev_bound_netdev_unregister(dev); + + netdev_offload_xstats_disable_all(dev); + + /* Notify protocols, that we are about to destroy + * this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + GFP_KERNEL, NULL, 0, + portid, nlh); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + + netdev_rss_contexts_free(dev); + + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + mutex_destroy(&dev->ethtool->rss_lock); + + net_shaper_flush_netdev(dev); + + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); + + /* Notifier chain MUST detach us all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + netdev_put(dev, &dev->dev_registered_tracker); + net_set_todo(dev); + cnt++; + } + atomic_add(cnt, &dev_unreg_count); + + list_del(head); +} + /** * unregister_netdevice_many - unregister many devices * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { - struct net_device *dev; - - if (!list_empty(head)) { - rollback_registered_many(head); - list_for_each_entry(dev, head, unreg_list) - net_set_todo(dev); - } + unregister_netdevice_many_notify(head, 0, NULL); } EXPORT_SYMBOL(unregister_netdevice_many); @@ -5979,85 +12458,129 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - rtnl_lock(); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); -/** - * dev_change_net_namespace - move device to different nethost namespace - * @dev: device - * @net: network namespace - * @pat: If not NULL name pattern to try if the current device name - * is already taken in the destination network namespace. - * - * This function shuts down a device interface and moves it - * to a new network namespace. On success 0 is returned, on - * a failure a netagive errno code is returned. - * - * Callers must hold the rtnl semaphore. 
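/*
 * Illustrative sketch (not part of the diff above): callers tearing down
 * several netdevs at once are expected to batch them, so that
 * unregister_netdevice_many() pays for flush_all_backlogs() and
 * synchronize_net() only once per batch.  A hypothetical cleanup path that
 * already runs under RTNL:
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void example_destroy_all(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	ASSERT_RTNL();

	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);

	/* one RCU/notifier round-trip for the whole batch */
	unregister_netdevice_many(&kill_list);
}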
- */ - -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack) { - int err; + struct netdev_name_node *name_node; + struct net *net_old = dev_net(dev); + char new_name[IFNAMSIZ] = {}; + int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) { + NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; + } - /* Ensure the device has been registrered */ - if (dev->reg_state != NETREG_REGISTERED) + /* Ensure the device has been registered */ + if (dev->reg_state != NETREG_REGISTERED) { + NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; + } /* Get out if there is nothing todo */ err = 0; - if (net_eq(dev_net(dev), net)) + if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ err = -EEXIST; - if (__dev_get_by_name(net, dev->name)) { + if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ - if (!pat) + if (!pat) { + NL_SET_ERR_MSG(extack, + "An interface with the same name exists in the target netns"); goto out; - if (dev_get_valid_name(net, dev, pat) < 0) + } + err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "Unable to use '%s' for the new interface name in the target netns", + pat); goto out; + } + } + /* Check that none of the altnames conflicts. */ + err = -EEXIST; + netdev_for_each_altname(dev, name_node) { + if (netdev_name_in_use(net, name_node->name)) { + NL_SET_ERR_MSG_FMT(extack, + "An interface with the altname %s exists in the target netns", + name_node->name); + goto out; + } + } + + /* Check that new_ifindex isn't used yet. */ + if (new_ifindex) { + err = dev_index_reserve(net, new_ifindex); + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "The ifindex %d is not available in the target netns", + new_ifindex); + goto out; + } + } else { + /* If there is an ifindex conflict assign a new one */ + err = dev_index_reserve(net, dev->ifindex); + if (err == -EBUSY) + err = dev_index_reserve(net, 0); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Unable to allocate a new ifindex in the target netns"); + goto out; + } + new_ifindex = err; } /* * And now a mini version of register_netdevice unregister_netdevice. */ + netdev_lock_ops(dev); /* If device is running close it first. */ - dev_close(dev); - + netif_close(dev); /* And unlink it from device chain */ - err = -ENODEV; unlist_netdevice(dev); + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); + synchronize_net(); /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); + netdev_unlock_ops(dev); /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - - Note that dev->reg_state stays at NETREG_REGISTERED. - This is wanted because this way 8021q and macvlan know - the device is just moving and can keep their slaves up. - */ + * this device. They should clean all the things. + * + * Note that dev->reg_state stays at NETREG_REGISTERED. + * This is wanted because this way 8021q and macvlan know + * the device is just moving and can keep their slaves up. 
+ */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + + new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); + + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, + new_ifindex); /* * Flush the unicast and multicast chains @@ -6067,55 +12590,69 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + netdev_adjacent_del_links(dev); + + /* Move per-net netdevice notifiers that are following the netdevice */ + move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); + dev->ifindex = new_ifindex; - /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) { - int iflink = (dev->iflink == dev->ifindex); - dev->ifindex = dev_new_index(net); - if (iflink) - dev->iflink = dev->ifindex; + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock_bh(&netdev_rename_lock); + strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); } + /* Fixup kobjects */ + dev_set_uevent_suppress(&dev->dev, 1); + err = device_rename(&dev->dev, dev->name); + dev_set_uevent_suppress(&dev->dev, 0); + WARN_ON(err); + /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); - /* Fixup kobjects */ - err = device_rename(&dev->dev, dev->name); + /* Adapt owner in case owning user namespace of target network + * namespace is different from the original one. + */ + err = netdev_change_owner(dev, net_old, net); WARN_ON(err); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); - /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; out: return err; } -EXPORT_SYMBOL_GPL(dev_change_net_namespace); -static int dev_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *ocpu) +static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; - unsigned int cpu, oldcpu = (unsigned long)ocpu; - struct softnet_data *sd, *oldsd; - - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; + unsigned int cpu; + struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); @@ -6137,29 +12674,47 @@ static int dev_cpu_callback(struct notifier_block *nfb, oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } - /* Append NAPI poll list from offline CPU. */ - if (!list_empty(&oldsd->poll_list)) { - list_splice_init(&oldsd->poll_list, &sd->poll_list); - raise_softirq_irqoff(NET_RX_SOFTIRQ); + /* Append NAPI poll list from offline CPU, with one exception : + * process_backlog() must be called by cpu owning percpu backlog. + * We properly handle process_queue & input_pkt_queue later. 
+ */ + while (!list_empty(&oldsd->poll_list)) { + struct napi_struct *napi = list_first_entry(&oldsd->poll_list, + struct napi_struct, + poll_list); + + list_del_init(&napi->poll_list); + if (napi->poll == process_backlog) + napi->state &= NAPIF_STATE_THREADED; + else + ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { +#ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; +#endif + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } + /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } - return NOTIFY_OK; + return 0; } - /** * netdev_increment_features - increment feature set by one * @all: current feature set @@ -6173,27 +12728,115 @@ static int dev_cpu_callback(struct notifier_block *nfb, netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { - if (mask & NETIF_F_GEN_CSUM) - mask |= NETIF_F_ALL_CSUM; + if (mask & NETIF_F_HW_CSUM) + mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; - all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ - if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + if (all & NETIF_F_HW_CSUM) + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); +/** + * netdev_compute_master_upper_features - compute feature from lowers + * @dev: the upper device + * @update_header: whether to update upper device's header_len/headroom/tailroom + * + * Recompute the upper device's feature based on all lower devices. 
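/*
 * Illustrative sketch (not part of the diff above): a master device
 * (bonding/bridge style) would call the helper documented here whenever its
 * set of lower devices changes.  The enslave function below is hypothetical;
 * netdev_master_upper_dev_link() is the existing linking helper.
 */
#include <linux/netdevice.h>

static int example_enslave(struct net_device *master, struct net_device *slave,
			   struct netlink_ext_ack *extack)
{
	int err;

	err = netdev_master_upper_dev_link(slave, master, NULL, NULL, extack);
	if (err)
		return err;

	/* refresh vlan/enc/mpls features and header sizes from all lowers */
	netdev_compute_master_upper_features(master, true);
	return 0;
}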
+ */ +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) +{ + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; + netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; + netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; + netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; + netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; + unsigned short max_header_len = ETH_HLEN; + unsigned int tso_max_size = TSO_MAX_SIZE; + unsigned short max_headroom = 0; + unsigned short max_tailroom = 0; + u16 tso_max_segs = TSO_MAX_SEGS; + struct net_device *lower_dev; + struct list_head *iter; + + mpls_features = netdev_base_features(mpls_features); + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + gso_partial_features = netdev_increment_features(gso_partial_features, + lower_dev->gso_partial_features, + MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); + + vlan_features = netdev_increment_features(vlan_features, + lower_dev->vlan_features, + MASTER_UPPER_DEV_VLAN_FEATURES); + + enc_features = netdev_increment_features(enc_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_ENC_FEATURES); + + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + xfrm_features = netdev_increment_features(xfrm_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_XFRM_FEATURES); + + mpls_features = netdev_increment_features(mpls_features, + lower_dev->mpls_features, + MASTER_UPPER_DEV_MPLS_FEATURES); + + dst_release_flag &= lower_dev->priv_flags; + + if (update_header) { + max_header_len = max(max_header_len, lower_dev->hard_header_len); + max_headroom = max(max_headroom, lower_dev->needed_headroom); + max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); + } + + tso_max_size = min(tso_max_size, lower_dev->tso_max_size); + tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); + } + + dev->gso_partial_features = gso_partial_features; + dev->vlan_features = vlan_features; + dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + dev->hw_enc_features |= xfrm_features; + dev->mpls_features = mpls_features; + + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + dev->priv_flags |= IFF_XMIT_DST_RELEASE; + + if (update_header) { + dev->hard_header_len = max_header_len; + dev->needed_headroom = max_headroom; + dev->needed_tailroom = max_tailroom; + } + + netif_set_tso_max_segs(dev, tso_max_segs); + netif_set_tso_max_size(dev, tso_max_size); + + netdev_change_features(dev); +} +EXPORT_SYMBOL(netdev_compute_master_upper_features); + static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; - hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); @@ -6204,8 +12847,10 @@ static struct hlist_head * __net_init netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { - if (net != &init_net) - INIT_LIST_HEAD(&net->dev_base_head); + 
BUILD_BUG_ON(GRO_HASH_BUCKETS > + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); + + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) @@ -6215,6 +12860,10 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); + + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: @@ -6245,51 +12894,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -6298,11 +12941,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); @@ -6318,6 +12959,9 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + xa_destroy(&net->dev_by_index); + if (net != &init_net) + WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { @@ -6325,28 +12969,36 @@ static struct pernet_operations __net_initdata netdev_net_ops = { .exit = netdev_exit, }; -static void __net_exit default_device_exit(struct net *net) +static void __net_exit default_device_exit_net(struct net *net) { + struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ - rtnl_lock(); + ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. 
loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + if (netdev_name_in_use(&init_net, fb_name)) + snprintf(fb_name, IFNAMSIZ, "dev%%d"); + + netdev_for_each_altname_safe(dev, name_node, tmp) + if (netdev_name_in_use(&init_net, name_node->name)) + __netdev_name_node_alt_destroy(name_node); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", @@ -6354,7 +13006,6 @@ static void __net_exit default_device_exit(struct net *net) BUG(); } } - rtnl_unlock(); } static void __net_exit default_device_exit_batch(struct list_head *net_list) @@ -6370,23 +13021,81 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { + default_device_exit_net(net); + cond_resched(); + } + + list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); - list_del(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { - .exit = default_device_exit, .exit_batch = default_device_exit_batch, }; +static void __init net_dev_struct_check(void) +{ + /* TX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); +#ifdef CONFIG_XPS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); + + /* TXRX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); + 
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); + + /* RX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); +#ifdef CONFIG_NETPOLL + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); +} + /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -6394,6 +13103,67 @@ static struct pernet_operations __net_initdata default_device_ops = { * */ +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ +#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) + +static int net_page_pool_create(int cpuid) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + struct page_pool_params page_pool_params = { + .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .flags = PP_FLAG_SYSTEM_POOL, + .nid = cpu_to_mem(cpuid), + }; + struct page_pool *pp_ptr; + int err; + + pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); + if (IS_ERR(pp_ptr)) + return -ENOMEM; + + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; +#endif + return 0; +} + +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog, false); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. 
@@ -6404,18 +13174,17 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); + net_dev_struct_check(); + if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; - INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); - INIT_LIST_HEAD(&offload_base); - if (register_pernet_subsys(&netdev_net_ops)) goto out; @@ -6423,32 +13192,41 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. */ + flush_backlogs_fallback = flush_backlogs_alloc(); + if (!flush_backlogs_fallback) + goto out; + for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); - memset(sd, 0, sizeof(*sd)); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); - sd->completion_queue = NULL; +#ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +#endif INIT_LIST_HEAD(&sd->poll_list); - sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; - sd->csd.flags = 0; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; - sd->backlog.gro_list = NULL; - sd->backlog.gro_count = 0; + INIT_LIST_HEAD(&sd->backlog.poll_list); -#ifdef CONFIG_NET_FLOW_LIMIT - sd->flow_limit = NULL; -#endif + if (net_page_pool_create(i)) + goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; @@ -6470,10 +13248,29 @@ static int __init net_dev_init(void) open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); - hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); + rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", + NULL, dev_cpu_dead); + WARN_ON(rc < 0); rc = 0; + + /* avoid static key IPIs to isolated CPUs */ + if (housekeeping_enabled(HK_TYPE_MISC)) + net_enable_timestamp(); out: + if (rc < 0) { + for_each_possible_cpu(i) { + struct page_pool *pp_ptr; + + pp_ptr = per_cpu(system_page_pool.pool, i); + if (!pp_ptr) + continue; + + xdp_unreg_page_pool(pp_ptr); + page_pool_destroy(pp_ptr); + per_cpu(system_page_pool.pool, i) = NULL; + } + } + return rc; } |
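/*
 * Illustrative sketch (not part of the diff above): net_dev_init() registers
 * dev_cpu_dead() as a teardown-only CPU hotplug callback via
 * cpuhp_setup_state_nocalls().  Other subsystems can follow the same pattern
 * with a dynamically allocated state; the names below are hypothetical.
 */
#include <linux/init.h>
#include <linux/cpuhotplug.h>

static int example_cpu_dead(unsigned int cpu)
{
	/* migrate this subsystem's per-CPU work away from @cpu */
	return 0;
}

static int __init example_hotplug_init(void)
{
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN, "example:dead",
					NULL, example_cpu_dead);
	return ret < 0 ? ret : 0;	/* DYN states return the state number */
}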
