diff options
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 2722 |
1 files changed, 1427 insertions, 1295 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index a39354ee1432..3430b1ed12e5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -97,8 +97,6 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <linux/stat.h> #include <net/dst.h> #include <net/pkt_sched.h> @@ -106,12 +104,10 @@ #include <net/xfrm.h> #include <linux/highmem.h> #include <linux/init.h> -#include <linux/kmod.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> -#include <net/wext.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> @@ -132,9 +128,9 @@ #include <linux/pci.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> -#include <linux/net_tstamp.h> #include <linux/static_key.h> -#include <net/flow_keys.h> +#include <linux/hashtable.h> +#include <linux/vmalloc.h> #include "net-sysfs.h" @@ -144,40 +140,11 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) -/* - * The list of packet types we will receive (as opposed to discard) - * and the routines to invoke. - * - * Why 16. Because with 16 the only overlap we get on a hash of the - * low nibble of the protocol value is RARP/SNAP/X.25. - * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversely affected. - * --BLG - * - * 0800 IP - * 8100 802.1Q VLAN - * 0001 802.3 - * 0002 AX.25 - * 0004 802.2 - * 8035 RARP - * 0005 SNAP - * 0805 X.25 - * 0806 ARP - * 8137 IPX - * 0009 Localtalk - * 86DD IPv6 - */ - -#define PTYPE_HASH_SIZE (16) -#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) - static DEFINE_SPINLOCK(ptype_lock); -static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -static struct list_head ptype_all __read_mostly; /* Taps */ +static DEFINE_SPINLOCK(offload_lock); +struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +struct list_head ptype_all __read_mostly; /* Taps */ +static struct list_head offload_base __read_mostly; /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -201,6 +168,14 @@ static struct list_head ptype_all __read_mostly; /* Taps */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + +static seqcount_t devnet_rename_seq; + static inline void dev_base_seq_inc(struct net *net) { while (++net->dev_base_seq == 0); @@ -233,7 +208,7 @@ static inline void rps_unlock(struct softnet_data *sd) } /* Device list insertion */ -static int list_netdevice(struct net_device *dev) +static void list_netdevice(struct net_device *dev) { struct net *net = dev_net(dev); @@ -247,8 +222,6 @@ static int list_netdevice(struct net_device *dev) write_unlock_bh(&dev_base_lock); dev_base_seq_inc(net); - - return 0; } /* Device list removal @@ -470,6 +443,82 @@ void dev_remove_pack(struct packet_type *pt) } EXPORT_SYMBOL(dev_remove_pack); + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + + spin_lock(&offload_lock); + list_add_rcu(&po->list, head); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(__dev_remove_offload); + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); + /****************************************************************************** Device Boot-time Settings Routines @@ -615,11 +664,10 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each_entry(dev, p, head, name_hlist) + hlist_for_each_entry(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; @@ -641,11 +689,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each_entry_rcu(dev, p, head, name_hlist) + hlist_for_each_entry_rcu(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; @@ -692,11 +739,10 @@ EXPORT_SYMBOL(dev_get_by_name); struct net_device *__dev_get_by_index(struct net *net, int ifindex) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each_entry(dev, p, head, index_hlist) + hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; @@ -717,11 +763,10 @@ EXPORT_SYMBOL(__dev_get_by_index); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each_entry_rcu(dev, p, head, index_hlist) + hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; @@ -755,6 +800,40 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** + * netdev_get_name - get a netdevice name, knowing its ifindex. + * @net: network namespace + * @name: a pointer to the buffer where the name will be stored. + * @ifindex: the ifindex of the interface to get the name from. + * + * The use of raw_seqcount_begin() and cond_resched() before + * retrying is required as we want to give the writers a chance + * to complete when CONFIG_PREEMPT is not set. + */ +int netdev_get_name(struct net *net, char *name, int ifindex) +{ + struct net_device *dev; + unsigned int seq; + +retry: + seq = raw_seqcount_begin(&devnet_rename_seq); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + + strcpy(name, dev->name); + rcu_read_unlock(); + if (read_seqcount_retry(&devnet_rename_seq, seq)) { + cond_resched(); + goto retry; + } + + return 0; +} + +/** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device @@ -959,18 +1038,30 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -static int dev_get_valid_name(struct net_device *dev, const char *name) +static int dev_alloc_name_ns(struct net *net, + struct net_device *dev, + const char *name) { - struct net *net; + char buf[IFNAMSIZ]; + int ret; - BUG_ON(!dev_net(dev)); - net = dev_net(dev); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} + +static int dev_get_valid_name(struct net *net, + struct net_device *dev, + const char *name) +{ + BUG_ON(!net); if (!dev_valid_name(name)) return -EINVAL; if (strchr(name, '%')) - return dev_alloc_name(dev, name); + return dev_alloc_name_ns(net, dev, name); else if (__dev_get_by_name(net, name)) return -EEXIST; else if (dev->name != name) @@ -1001,22 +1092,31 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) + write_seqcount_begin(&devnet_rename_seq); + + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { + write_seqcount_end(&devnet_rename_seq); return 0; + } memcpy(oldname, dev->name, IFNAMSIZ); - err = dev_get_valid_name(dev, newname); - if (err < 0) + err = dev_get_valid_name(net, dev, newname); + if (err < 0) { + write_seqcount_end(&devnet_rename_seq); return err; + } rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + write_seqcount_end(&devnet_rename_seq); return ret; } + write_seqcount_end(&devnet_rename_seq); + write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); @@ -1034,6 +1134,7 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; + write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); goto rollback; } else { @@ -1063,10 +1164,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return -EINVAL; if (!len) { - if (dev->ifalias) { - kfree(dev->ifalias); - dev->ifalias = NULL; - } + kfree(dev->ifalias); + dev->ifalias = NULL; return 0; } @@ -1109,41 +1208,23 @@ void netdev_state_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_state_change); -int netdev_bonding_change(struct net_device *dev, unsigned long event) -{ - return call_netdevice_notifiers(event, dev); -} -EXPORT_SYMBOL(netdev_bonding_change); - /** - * dev_load - load a network module - * @net: the applicable net namespace - * @name: name of interface + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device * - * If a network interface is not present and the process has suitable - * privileges this function loads the module. If module loading is not - * available in this kernel then it becomes a nop. + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. */ - -void dev_load(struct net *net, const char *name) +void netdev_notify_peers(struct net_device *dev) { - struct net_device *dev; - int no_module; - - rcu_read_lock(); - dev = dev_get_by_name_rcu(net, name); - rcu_read_unlock(); - - no_module = !dev; - if (no_module && capable(CAP_NET_ADMIN)) - no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } + rtnl_lock(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + rtnl_unlock(); } -EXPORT_SYMBOL(dev_load); +EXPORT_SYMBOL(netdev_notify_peers); static int __dev_open(struct net_device *dev) { @@ -1155,6 +1236,12 @@ static int __dev_open(struct net_device *dev) if (!netif_device_present(dev)) return -ENODEV; + /* Block netpoll from trying to do any rx path servicing. + * If we don't do this there is a chance ndo_poll_controller + * or ndo_poll may be running while we open the device + */ + netpoll_rx_disable(dev); + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); if (ret) @@ -1168,6 +1255,8 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); + netpoll_rx_enable(dev); + if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { @@ -1259,9 +1348,14 @@ static int __dev_close(struct net_device *dev) int retval; LIST_HEAD(single); + /* Temporarily disable netpoll until the interface is down */ + netpoll_rx_disable(dev); + list_add(&dev->unreg_list, &single); retval = __dev_close_many(&single); list_del(&single); + + netpoll_rx_enable(dev); return retval; } @@ -1300,9 +1394,14 @@ int dev_close(struct net_device *dev) if (dev->flags & IFF_UP) { LIST_HEAD(single); + /* Block netpoll rx while the interface is going down */ + netpoll_rx_disable(dev); + list_add(&dev->unreg_list, &single); dev_close_many(&single); list_del(&single); + + netpoll_rx_enable(dev); } return 0; } @@ -1334,6 +1433,14 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return nb->notifier_call(nb, val, &info); +} static int dev_boot_phase = 1; @@ -1366,7 +1473,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; for_each_net(net) { for_each_netdev(net, dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) goto rollback; @@ -1374,7 +1481,7 @@ int register_netdevice_notifier(struct notifier_block *nb) if (!(dev->flags & IFF_UP)) continue; - nb->notifier_call(nb, NETDEV_UP, dev); + call_netdevice_notifier(nb, NETDEV_UP, dev); } } @@ -1390,11 +1497,11 @@ rollback: goto outroll; if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } @@ -1432,11 +1539,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } unlock: @@ -1446,6 +1553,25 @@ unlock: EXPORT_SYMBOL(unregister_netdevice_notifier); /** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, + struct netdev_notifier_info *info) +{ + ASSERT_RTNL(); + netdev_notifier_info_init(info, dev); + return raw_notifier_call_chain(&netdev_chain, val, info); +} +EXPORT_SYMBOL(call_netdevice_notifiers_info); + +/** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function @@ -1456,8 +1582,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); - return raw_notifier_call_chain(&netdev_chain, val, dev); + struct netdev_notifier_info info; + + return call_netdevice_notifiers_info(val, dev, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -1481,7 +1608,6 @@ void net_enable_timestamp(void) return; } #endif - WARN_ON(in_interrupt()); static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1511,57 +1637,6 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -static int net_hwtstamp_validate(struct ifreq *ifr) -{ - struct hwtstamp_config cfg; - enum hwtstamp_tx_types tx_type; - enum hwtstamp_rx_filters rx_filter; - int tx_type_valid = 0; - int rx_filter_valid = 0; - - if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) - return -EFAULT; - - if (cfg.flags) /* reserved for future extensions */ - return -EINVAL; - - tx_type = cfg.tx_type; - rx_filter = cfg.rx_filter; - - switch (tx_type) { - case HWTSTAMP_TX_OFF: - case HWTSTAMP_TX_ON: - case HWTSTAMP_TX_ONESTEP_SYNC: - tx_type_valid = 1; - break; - } - - switch (rx_filter) { - case HWTSTAMP_FILTER_NONE: - case HWTSTAMP_FILTER_ALL: - case HWTSTAMP_FILTER_SOME: - case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: - rx_filter_valid = 1; - break; - } - - if (!tx_type_valid || !rx_filter_valid) - return -ERANGE; - - return 0; -} - static inline bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { @@ -1611,23 +1686,19 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } } - skb_orphan(skb); - nf_reset(skb); - if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } - skb->skb_iif = 0; - skb->dev = dev; - skb_dst_drop(skb); - skb->tstamp.tv64 = 0; - skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); + + /* eth_type_trans() can set pkt_type. + * call skb_scrub_packet() after it to clear pkt_type _after_ calling + * eth_type_trans(). + */ + skb_scrub_packet(skb, true); + return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1642,6 +1713,19 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (!ptype->af_packet_priv || !skb->sk) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1659,8 +1743,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { + (!skb_loop_sk(ptype, skb))) { if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; @@ -1680,7 +1763,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || - skb2->network_header > skb2->tail) { + skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); @@ -1735,6 +1818,231 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) } } +#ifdef CONFIG_XPS +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, + int cpu, u16 index) +{ + struct xps_map *map = NULL; + int pos; + + if (dev_maps) + map = xmap_dereference(dev_maps->cpu_map[cpu]); + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] == index) { + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; + } else { + RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); + kfree_rcu(map, rcu); + map = NULL; + } + break; + } + } + + return map; +} + +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + struct xps_dev_maps *dev_maps; + int cpu, i; + bool active = false; + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (!dev_maps) + goto out_no_maps; + + for_each_possible_cpu(cpu) { + for (i = index; i < dev->num_tx_queues; i++) { + if (!remove_xps_queue(dev_maps, cpu, i)) + break; + } + if (i == dev->num_tx_queues) + active = true; + } + + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + + for (i = index; i < dev->num_tx_queues; i++) + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + +out_no_maps: + mutex_unlock(&xps_map_mutex); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, + int cpu, u16 index) +{ + struct xps_map *new_map; + int alloc_len = XPS_MIN_MAP_ALLOC; + int i, pos; + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] != index) + continue; + return map; + } + + /* Need to add queue to this CPU's existing map */ + if (map) { + if (pos < map->alloc_len) + return map; + + alloc_len = map->alloc_len * 2; + } + + /* Need to allocate new map to store queue on this CPU's map */ + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + return NULL; + + for (i = 0; i < pos; i++) + new_map->queues[i] = map->queues[i]; + new_map->alloc_len = alloc_len; + new_map->len = pos; + + return new_map; +} + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_map *map, *new_map; + int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); + int cpu, numa_node_id = -2; + bool active = false; + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + /* allocate memory for queue storage */ + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + + if (!new_dev_maps) + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + + map = expand_xps_map(map, cpu, index); + if (!map) + goto error; + + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + if (!new_dev_maps) + goto out_no_new_maps; + + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { + /* add queue to CPU maps */ + int pos = 0; + + map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + while ((pos < map->len) && (map->queues[pos] != index)) + pos++; + + if (pos == map->len) + map->queues[map->len++] = index; +#ifdef CONFIG_NUMA + if (numa_node_id == -2) + numa_node_id = cpu_to_node(cpu); + else if (numa_node_id != cpu_to_node(cpu)) + numa_node_id = -1; +#endif + } else if (dev_maps) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[cpu]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + } + + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + + /* Cleanup old maps */ + if (dev_maps) { + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->cpu_map[cpu]); + if (map && map != new_map) + kfree_rcu(map, rcu); + } + + kfree_rcu(dev_maps, rcu); + } + + dev_maps = new_dev_maps; + active = true; + +out_no_new_maps: + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? numa_node_id : + NUMA_NO_NODE); + + if (!dev_maps) + goto out_no_maps; + + /* removes queue from unused CPUs */ + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) + continue; + + if (remove_xps_queue(dev_maps, cpu, index)) + active = true; + } + + /* free map if not active */ + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + +out_no_maps: + mutex_unlock(&xps_map_mutex); + + return 0; +error: + /* remove any maps that we added */ + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } + + mutex_unlock(&xps_map_mutex); + + kfree(new_dev_maps); + return -ENOMEM; +} +EXPORT_SYMBOL(netif_set_xps_queue); + +#endif /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -1758,8 +2066,12 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); - if (txq < dev->real_num_tx_queues) + if (txq < dev->real_num_tx_queues) { qdisc_reset_all_tx_gt(dev, txq); +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, txq); +#endif + } } dev->real_num_tx_queues = txq; @@ -1896,6 +2208,9 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) struct net_device *dev = skb->dev; const char *driver = ""; + if (!net_ratelimit()) + return; + if (dev && dev->dev.parent) driver = dev_driver_string(dev->dev.parent); @@ -1924,6 +2239,15 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (skb_has_shared_frag(skb)) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -1946,61 +2270,67 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -/** - * skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, - netdev_features_t features) +__be16 skb_network_protocol(struct sk_buff *skb) { - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_type *ptype; __be16 type = skb->protocol; int vlan_depth = ETH_HLEN; - int err; - while (type == htons(ETH_P_8021Q)) { + /* Tunnel gso handlers can set protocol to ethernet. */ + if (type == htons(ETH_P_TEB)) { + struct ethhdr *eth; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + return 0; + + eth = (struct ethhdr *)skb_mac_header(skb); + type = eth->h_proto; + } + + while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { struct vlan_hdr *vh; if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return ERR_PTR(-EINVAL); + return 0; vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } - skb_reset_mac_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - __skb_pull(skb, skb->mac_len); + return type; +} - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - skb_warn_bad_offload(skb); +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + __be16 type = skb_network_protocol(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) - return ERR_PTR(err); - } + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, skb->mac_len); rcu_read_lock(); - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - err = ptype->gso_send_check(skb); + int err; + + err = ptype->callbacks.gso_send_check(skb); segs = ERR_PTR(err); if (err || skb_gso_ok(skb, features)) break; __skb_push(skb, (skb->data - skb_network_header(skb))); } - segs = ptype->gso_segment(skb, features); + segs = ptype->callbacks.gso_segment(skb, features); break; } } @@ -2010,7 +2340,50 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, return segs; } -EXPORT_SYMBOL(skb_gso_segment); +EXPORT_SYMBOL(skb_mac_gso_segment); + + +/* openvswitch calls this on rx path, so we need a different check. + */ +static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL; + else + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + skb_warn_bad_offload(skb); + + if (skb_header_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return skb_mac_gso_segment(skb, features); +} +EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG @@ -2108,23 +2481,12 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } -static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) -{ - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); -} - static netdev_features_t harmonize_features(struct sk_buff *skb, - __be16 protocol, netdev_features_t features) + netdev_features_t features) { - if (!can_checksum_protocol(features, protocol)) { + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, skb_network_protocol(skb))) { features &= ~NETIF_F_ALL_CSUM; - features &= ~NETIF_F_SG; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2140,34 +2502,32 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q)) { + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, protocol, features); + return harmonize_features(skb, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX); + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); - if (protocol != htons(ETH_P_8021Q)) { - return harmonize_features(skb, protocol, features); - } else { + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX; - return harmonize_features(skb, protocol, features); - } + NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + + return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); /* * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG, or if - * at least one of fragments is in highmem and device does not - * support DMA from it. + * 2. skb is fragmented and the device does not support SG. */ static inline int skb_needs_linearize(struct sk_buff *skb, - int features) + netdev_features_t features) { return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && @@ -2193,20 +2553,25 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); - features = netif_skb_features(skb); if (vlan_tx_tag_present(skb) && - !(features & NETIF_F_HW_VLAN_TX)) { - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) goto out; skb->vlan_tci = 0; } + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + if (netif_needs_gso(skb, features)) { if (unlikely(dev_gso_segment(skb, features))) goto out_kfree_skb; @@ -2222,14 +2587,21 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); if (!(features & NETIF_F_ALL_CSUM) && skb_checksum_help(skb)) goto out_kfree_skb; } } + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + skb_len = skb->len; rc = ops->ndo_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); @@ -2245,12 +2617,8 @@ gso: skb->next = nskb->next; nskb->next = NULL; - /* - * If device doesn't need nskb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(nskb); + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(nskb, dev); skb_len = nskb->len; rc = ops->ndo_start_xmit(nskb, dev); @@ -2268,134 +2636,45 @@ gso: } while (skb->next); out_kfree_gso_skb: - if (likely(skb->next == NULL)) + if (likely(skb->next == NULL)) { skb->destructor = DEV_GSO_CB(skb)->destructor; + consume_skb(skb); + return rc; + } out_kfree_skb: kfree_skb(skb); out: return rc; } -static u32 hashrnd __read_mostly; - -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. - */ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, - unsigned int num_tx_queues) -{ - u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; - - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } - - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = jhash_1word(hash, hashrnd); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; -} -EXPORT_SYMBOL(__skb_tx_hash); - -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +static void qdisc_pkt_len_init(struct sk_buff *skb) { - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) -{ -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; - - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[raw_smp_processor_id()]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->rxhash; - hash = jhash_1word(hash, hashrnd); - queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); + const struct skb_shared_info *shinfo = skb_shinfo(skb); - return queue_index; -#else - return -1; -#endif -} - -static struct netdev_queue *dev_pick_tx(struct net_device *dev, - struct sk_buff *skb) -{ - int queue_index; - const struct net_device_ops *ops = dev->netdev_ops; + qdisc_skb_cb(skb)->pkt_len = skb->len; - if (dev->real_num_tx_queues == 1) - queue_index = 0; - else if (ops->ndo_select_queue) { - queue_index = ops->ndo_select_queue(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); - } else { - struct sock *sk = skb->sk; - queue_index = sk_tx_queue_get(sk); + /* To get more precise estimation of bytes sent on wire, + * we add to pkt_len the headers size of all segments + */ + if (shinfo->gso_size) { + unsigned int hdr_len; + u16 gso_segs = shinfo->gso_segs; - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int old_index = queue_index; + /* mac layer + network layer */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - queue_index = get_xps_queue(dev, skb); - if (queue_index < 0) - queue_index = skb_tx_hash(dev, skb); + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + hdr_len += tcp_hdrlen(skb); + else + hdr_len += sizeof(struct udphdr); - if (queue_index != old_index && sk) { - struct dst_entry *dst = - rcu_dereference_check(sk->sk_dst_cache, 1); + if (shinfo->gso_type & SKB_GSO_DODGY) + gso_segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); - if (dst && skb_dst(skb) == dst) - sk_tx_queue_set(sk, queue_index); - } - } + qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, @@ -2406,7 +2685,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, bool contended; int rc; - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a @@ -2529,6 +2808,8 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + skb_reset_mac_header(skb); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ @@ -2536,7 +2817,7 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = dev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT @@ -2609,6 +2890,8 @@ EXPORT_SYMBOL(dev_queue_xmit); =======================================================================*/ int netdev_max_backlog __read_mostly = 1000; +EXPORT_SYMBOL(netdev_max_backlog); + int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ @@ -2621,40 +2904,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, __raise_softirq_irqoff(NET_RX_SOFTIRQ); } -/* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Sets rxhash in skb to non-zero hash value - * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb - * if hash is a canonical 4-tuple hash over transport ports. - */ -void __skb_get_rxhash(struct sk_buff *skb) -{ - struct flow_keys keys; - u32 hash; - - if (!skb_flow_dissect(skb, &keys)) - return; - - if (keys.ports) { - if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) - swap(keys.port16[0], keys.port16[1]); - skb->l4_rxhash = 1; - } - - /* get a consistent hash (same value on both flow directions) */ - if ((__force u32)keys.dst < (__force u32)keys.src) - swap(keys.dst, keys.src); - - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, hashrnd); - if (!hash) - hash = 1; - - skb->rxhash = hash; -} -EXPORT_SYMBOL(__skb_get_rxhash); - #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ @@ -2779,8 +3028,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) + rflow->last_qtail)) >= 0)) { + tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + } if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { *rflowp = rflow; @@ -2874,6 +3125,46 @@ static int rps_ipi_queued(struct softnet_data *sd) return 0; } +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (netdev_max_backlog >> 1)) + return false; + + sd = &__get_cpu_var(softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). @@ -2883,13 +3174,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, { struct softnet_data *sd; unsigned long flags; + unsigned int qlen; sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); rps_lock(sd); - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -3138,6 +3431,7 @@ int netdev_rx_handler_register(struct net_device *dev, if (dev->rx_handler) return -EBUSY; + /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); @@ -3149,7 +3443,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register); * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * - * Unregister a receive hander from a device. + * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ @@ -3158,6 +3452,11 @@ void netdev_rx_handler_unregister(struct net_device *dev) ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); + /* a reader seeing a non NULL rx_handler in a rcu_read_lock() + * section has a guarantee to see a non NULL rx_handler_data + * as well. + */ + synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); @@ -3173,13 +3472,14 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) case __constant_htons(ETH_P_IP): case __constant_htons(ETH_P_IPV6): case __constant_htons(ETH_P_8021Q): + case __constant_htons(ETH_P_8021AD): return true; default: return false; } } -static int __netif_receive_skb(struct sk_buff *skb) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; @@ -3188,24 +3488,11 @@ static int __netif_receive_skb(struct sk_buff *skb) bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; - unsigned long pflags = current->flags; net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); - /* - * PFMEMALLOC skbs are special, they should - * - be delivered to SOCK_MEMALLOC sockets only - * - stay away from userspace - * - have bounded memory usage - * - * Use PF_MEMALLOC as this saves us from propagating the allocation - * context down to all allocation sites. - */ - if (sk_memalloc_socks() && skb_pfmemalloc(skb)) - current->flags |= PF_MEMALLOC; - /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) goto out; @@ -3213,7 +3500,8 @@ static int __netif_receive_skb(struct sk_buff *skb) orig_dev = skb->dev; skb_reset_network_header(skb); - skb_reset_transport_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); skb_reset_mac_len(skb); pt_prev = NULL; @@ -3225,7 +3513,8 @@ another_round: __this_cpu_inc(softnet_data.processed); - if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || + skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = vlan_untag(skb); if (unlikely(!skb)) goto unlock; @@ -3238,7 +3527,7 @@ another_round: } #endif - if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -3257,22 +3546,21 @@ skip_taps: ncls: #endif - if (sk_memalloc_socks() && skb_pfmemalloc(skb) - && !skb_pfmemalloc_protocol(skb)) + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; - rx_handler = rcu_dereference(skb->dev->rx_handler); if (vlan_tx_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - if (vlan_do_receive(&skb, !rx_handler)) + if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto unlock; } + rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); @@ -3280,6 +3568,7 @@ ncls: } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: + ret = NET_RX_SUCCESS; goto unlock; case RX_HANDLER_ANOTHER: goto another_round; @@ -3292,6 +3581,16 @@ ncls: } } + if (unlikely(vlan_tx_tag_present(skb))) { + if (vlan_tx_tag_get_id(skb)) + skb->pkt_type = PACKET_OTHERHOST; + /* Note: we might in the future use prio bits + * and set skb->priority like in vlan_do_receive() + * For the time being, just ignore Priority Code Point + */ + skb->vlan_tci = 0; + } + /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; @@ -3309,7 +3608,7 @@ ncls: if (pt_prev) { if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) - ret = -ENOMEM; + goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { @@ -3325,7 +3624,31 @@ drop: unlock: rcu_read_unlock(); out: - tsk_restore_flags(current, pflags, PF_MEMALLOC); + return ret; +} + +static int __netif_receive_skb(struct sk_buff *skb) +{ + int ret; + + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { + unsigned long pflags = current->flags; + + /* + * PFMEMALLOC skbs are special, they should + * - be delivered to SOCK_MEMALLOC sockets only + * - stay away from userspace + * - have bounded memory usage + * + * Use PF_MEMALLOC as this saves us from propagating the allocation + * context down to all allocation sites. + */ + current->flags |= PF_MEMALLOC; + ret = __netif_receive_skb_core(skb, true); + tsk_restore_flags(current, pflags, PF_MEMALLOC); + } else + ret = __netif_receive_skb_core(skb, false); + return ret; } @@ -3402,11 +3725,13 @@ static void flush_backlog(void *arg) static int napi_gro_complete(struct sk_buff *skb) { - struct packet_type *ptype; + struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + struct list_head *head = &offload_base; int err = -ENOENT; + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + if (NAPI_GRO_CB(skb)->count == 1) { skb_shinfo(skb)->gso_size = 0; goto out; @@ -3414,10 +3739,10 @@ static int napi_gro_complete(struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || ptype->dev || !ptype->gro_complete) + if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->gro_complete(skb); + err = ptype->callbacks.gro_complete(skb); break; } rcu_read_unlock(); @@ -3432,29 +3757,64 @@ out: return netif_receive_skb(skb); } -inline void napi_gro_flush(struct napi_struct *napi) +/* napi->gro_list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - struct sk_buff *skb, *next; + struct sk_buff *skb, *prev = NULL; - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; + /* scan list and build reverse chain */ + for (skb = napi->gro_list; skb != NULL; skb = skb->next) { + skb->prev = prev; + prev = skb; + } + + for (skb = prev; skb; skb = prev) { skb->next = NULL; + + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + + prev = skb->prev; napi_gro_complete(skb); + napi->gro_count--; } - napi->gro_count = 0; napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); -enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; + + for (p = napi->gro_list; p; p = p->next) { + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); + NAPI_GRO_CB(p)->same_flow = !diffs; + NAPI_GRO_CB(p)->flush = 0; + } +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; - struct packet_type *ptype; + struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + struct list_head *head = &offload_base; int same_flow; - int mac_len; enum gro_result ret; if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) @@ -3463,19 +3823,20 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; + gro_list_prepare(napi, skb); + rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || ptype->dev || !ptype->gro_receive) + if (ptype->type != type || !ptype->callbacks.gro_receive) continue; skb_set_network_header(skb, skb_gro_offset(skb)); - mac_len = skb->network_header - skb->mac_header; - skb->mac_len = mac_len; + skb_reset_mac_len(skb); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; - pp = ptype->gro_receive(&napi->gro_list, skb); + pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; } rcu_read_unlock(); @@ -3503,6 +3864,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) napi->gro_count++; NAPI_GRO_CB(skb)->count = 1; + NAPI_GRO_CB(skb)->age = jiffies; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; @@ -3537,34 +3899,9 @@ normal: ret = GRO_NORMAL; goto pull; } -EXPORT_SYMBOL(dev_gro_receive); - -static inline gro_result_t -__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - struct sk_buff *p; - unsigned int maclen = skb->dev->hard_header_len; - - for (p = napi->gro_list; p; p = p->next) { - unsigned long diffs; - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; - if (maclen == ETH_HLEN) - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); - else if (!diffs) - diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), - maclen); - NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; - } - - return dev_gro_receive(napi, skb); -} -gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { case GRO_NORMAL: @@ -3590,28 +3927,29 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) return ret; } -EXPORT_SYMBOL(napi_skb_finish); -void skb_gro_reset_offset(struct sk_buff *skb) +static void skb_gro_reset_offset(struct sk_buff *skb) { + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb->mac_header == skb->tail && - !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { - NAPI_GRO_CB(skb)->frag0 = - skb_frag_address(&skb_shinfo(skb)->frags[0]); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); + if (skb_mac_header(skb) == skb_tail_pointer(skb) && + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); } } -EXPORT_SYMBOL(skb_gro_reset_offset); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); - return napi_skb_finish(__napi_gro_receive(napi, skb), skb); + return napi_skb_finish(dev_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -3640,7 +3978,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, +static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { @@ -3665,7 +4003,6 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, return ret; } -EXPORT_SYMBOL(napi_frags_finish); static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { @@ -3710,7 +4047,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) if (!skb) return GRO_DROP; - return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); + return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); } EXPORT_SYMBOL(napi_gro_frags); @@ -3837,13 +4174,65 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n); + napi_gro_flush(n, false); local_irq_save(flags); __napi_complete(n); local_irq_restore(flags); } EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -3852,6 +4241,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->gro_list = NULL; napi->skb = NULL; napi->poll = poll; + if (weight > NAPI_POLL_WEIGHT) + pr_err_once("netif_napi_add() called with weight %d on device %s\n", + weight, dev->name); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; @@ -3898,7 +4290,7 @@ static void net_rx_action(struct softirq_action *h) * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ - if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) + if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; local_irq_enable(); @@ -3942,8 +4334,17 @@ static void net_rx_action(struct softirq_action *h) local_irq_enable(); napi_complete(n); local_irq_disable(); - } else + } else { + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + local_irq_enable(); + napi_gro_flush(n, HZ >= 1000); + local_irq_disable(); + } list_move_tail(&n->poll_list, &sd->poll_list); + } } netpoll_poll_unlock(have); @@ -3967,525 +4368,452 @@ softnet_break: goto out; } -static gifconf_func_t *gifconf_list[NPROTO]; +struct netdev_adjacent { + struct net_device *dev; -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} -EXPORT_SYMBOL(register_gifconf); + /* upper master flag, there can only be one master device per list */ + bool master; + /* indicates that this dev is our first-level lower/upper device */ + bool neighbour; -/* - * Map an interface index to its name (SIOCGIFNAME) - */ + /* counter for the number of times this device was added to us */ + u16 ref_nr; -/* - * We need this ioctl for efficient implementation of the - * if_indextoname() function required by the IPv6 API. Without - * it, we would have to search all the interfaces to find a - * match. --pb - */ + struct list_head list; + struct rcu_head rcu; +}; -static int dev_ifname(struct net *net, struct ifreq __user *arg) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + bool upper) { - struct net_device *dev; - struct ifreq ifr; + struct netdev_adjacent *adj; + struct list_head *dev_list; - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; + dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); - if (!dev) { - rcu_read_unlock(); - return -ENODEV; + list_for_each_entry(adj, dev_list, list) { + if (adj->dev == adj_dev) + return adj; } + return NULL; +} - strcpy(ifr.ifr_name, dev->name); - rcu_read_unlock(); +static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_find_adj(dev, udev, true); +} - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; +static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_find_adj(dev, ldev, false); } -/* - * Perform a SIOCGIFCONF call. This structure will change - * size eventually, and there is nothing I can do about it. - * Thus we will need a 'compatibility mode'. +/** + * netdev_has_upper_dev - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks only immediate upper device, + * not through a complete stack of devices. The caller must hold the RTNL lock. */ - -static int dev_ifconf(struct net *net, char __user *arg) +bool netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) { - struct ifconf ifc; - struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + ASSERT_RTNL(); - /* - * Fetch the caller's info block. - */ + return __netdev_find_upper(dev, upper_dev); +} +EXPORT_SYMBOL(netdev_has_upper_dev); - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; +/** + * netdev_has_any_upper_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to an upper device and return true in case + * it is. The caller must hold the RTNL lock. + */ +bool netdev_has_any_upper_dev(struct net_device *dev) +{ + ASSERT_RTNL(); - pos = ifc.ifc_buf; - len = ifc.ifc_len; + return !list_empty(&dev->upper_dev_list); +} +EXPORT_SYMBOL(netdev_has_any_upper_dev); - /* - * Loop over the interfaces, and write an info block for each. - */ +/** + * netdev_master_upper_dev_get - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RTNL lock. + */ +struct net_device *netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; - total = 0; - for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0); - else - done = gifconf_list[i](dev, pos + total, - len - total); - if (done < 0) - return -EFAULT; - total += done; - } - } - } + ASSERT_RTNL(); - /* - * All done. Write the updated control block back to the caller. - */ - ifc.ifc_len = total; + if (list_empty(&dev->upper_dev_list)) + return NULL; - /* - * Both BSD and Solaris return 0 here, so we do too. - */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; + upper = list_first_entry(&dev->upper_dev_list, + struct netdev_adjacent, list); + if (likely(upper->master)) + return upper->dev; + return NULL; } +EXPORT_SYMBOL(netdev_master_upper_dev_get); -#ifdef CONFIG_PROC_FS +/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + WARN_ON_ONCE(!rcu_read_lock_held()); -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net *net = seq_file_net(seq); - struct net_device *dev; - struct hlist_node *p; - struct hlist_head *h; - unsigned int count = 0, offset = get_offset(*pos); + if (&upper->list == &dev->upper_dev_list) + return NULL; - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, p, h, name_hlist) { - if (++count == offset) - return dev; - } + *iter = &upper->list; - return NULL; + return upper->dev; } +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +/** + * netdev_master_upper_dev_get_rcu - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RCU read lock. + */ +struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct net_device *dev; - unsigned int bucket; - - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; - - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); + struct netdev_adjacent *upper; + upper = list_first_or_null_rcu(&dev->upper_dev_list, + struct netdev_adjacent, list); + if (upper && likely(upper->master)) + return upper->dev; return NULL; } +EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. - */ -void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + bool neighbour, bool master, + bool upper) { - rcu_read_lock(); - if (!*pos) - return SEQ_START_TOKEN; + struct netdev_adjacent *adj; - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) - return NULL; + adj = __netdev_find_adj(dev, adj_dev, upper); - return dev_from_bucket(seq, pos); -} + if (adj) { + BUG_ON(neighbour); + adj->ref_nr++; + return 0; + } -void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return dev_from_bucket(seq, pos); -} + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; -void dev_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} + adj->dev = adj_dev; + adj->master = master; + adj->neighbour = neighbour; + adj->ref_nr = 1; -static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) -{ - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + dev_hold(adj_dev); + pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); - seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " - "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); -} + if (!upper) { + list_add_tail_rcu(&adj->list, &dev->lower_dev_list); + return 0; + } -/* - * Called from the PROCfs module. This now uses the new arbitrary sized - * /proc/net interface to create /proc/net/dev - */ -static int dev_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); + /* Ensure that master upper link is always the first item in list. */ + if (master) + list_add_rcu(&adj->list, &dev->upper_dev_list); else - dev_seq_printf_stats(seq, v); + list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + return 0; } -static struct softnet_data *softnet_get_online(loff_t *pos) +static inline int __netdev_upper_dev_insert(struct net_device *dev, + struct net_device *udev, + bool master, bool neighbour) { - struct softnet_data *sd = NULL; - - while (*pos < nr_cpu_ids) - if (cpu_online(*pos)) { - sd = &per_cpu(softnet_data, *pos); - break; - } else - ++*pos; - return sd; + return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, + true); } -static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +static inline int __netdev_lower_dev_insert(struct net_device *dev, + struct net_device *ldev, + bool neighbour) { - return softnet_get_online(pos); + return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, + false); } -static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, bool upper) { - ++*pos; - return softnet_get_online(pos); -} + struct netdev_adjacent *adj; -static void softnet_seq_stop(struct seq_file *seq, void *v) -{ -} + if (upper) + adj = __netdev_find_upper(dev, adj_dev); + else + adj = __netdev_find_lower(dev, adj_dev); -static int softnet_seq_show(struct seq_file *seq, void *v) -{ - struct softnet_data *sd = v; + if (!adj) + BUG(); - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, - 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps); - return 0; -} + if (adj->ref_nr > 1) { + adj->ref_nr--; + return; + } -static const struct seq_operations dev_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_seq_show, -}; + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} -static int dev_seq_open(struct inode *inode, struct file *file) +static inline void __netdev_upper_dev_remove(struct net_device *dev, + struct net_device *udev) { - return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); + return __netdev_adjacent_dev_remove(dev, udev, true); } -static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, - .open = dev_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static const struct seq_operations softnet_seq_ops = { - .start = softnet_seq_start, - .next = softnet_seq_next, - .stop = softnet_seq_stop, - .show = softnet_seq_show, -}; - -static int softnet_seq_open(struct inode *inode, struct file *file) +static inline void __netdev_lower_dev_remove(struct net_device *dev, + struct net_device *ldev) { - return seq_open(file, &softnet_seq_ops); + return __netdev_adjacent_dev_remove(dev, ldev, false); } -static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, - .open = softnet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *ptype_get_idx(loff_t pos) +int __netdev_adjacent_dev_insert_link(struct net_device *dev, + struct net_device *upper_dev, + bool master, bool neighbour) { - struct packet_type *pt = NULL; - loff_t i = 0; - int t; + int ret; - list_for_each_entry_rcu(pt, &ptype_all, list) { - if (i == pos) - return pt; - ++i; - } + ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + if (ret) + return ret; - for (t = 0; t < PTYPE_HASH_SIZE; t++) { - list_for_each_entry_rcu(pt, &ptype_base[t], list) { - if (i == pos) - return pt; - ++i; - } + ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + if (ret) { + __netdev_upper_dev_remove(dev, upper_dev); + return ret; } - return NULL; + + return 0; } -static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) +static inline int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *udev) { - rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return __netdev_adjacent_dev_insert_link(dev, udev, false, false); } -static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *udev, + bool master) { - struct packet_type *pt; - struct list_head *nxt; - int hash; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); - - pt = v; - nxt = pt->list.next; - if (pt->type == htons(ETH_P_ALL)) { - if (nxt != &ptype_all) - goto found; - hash = 0; - nxt = ptype_base[0].next; - } else - hash = ntohs(pt->type) & PTYPE_HASH_MASK; - - while (nxt == &ptype_base[hash]) { - if (++hash >= PTYPE_HASH_SIZE) - return NULL; - nxt = ptype_base[hash].next; - } -found: - return list_entry(nxt, struct packet_type, list); + return __netdev_adjacent_dev_insert_link(dev, udev, master, true); } -static void ptype_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) +void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) { - rcu_read_unlock(); + __netdev_upper_dev_remove(dev, upper_dev); + __netdev_lower_dev_remove(upper_dev, dev); } -static int ptype_seq_show(struct seq_file *seq, void *v) + +static int __netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, bool master) { - struct packet_type *pt = v; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { - if (pt->type == htons(ETH_P_ALL)) - seq_puts(seq, "ALL "); - else - seq_printf(seq, "%04x", ntohs(pt->type)); + ASSERT_RTNL(); + + if (dev == upper_dev) + return -EBUSY; + + /* To prevent loops, check if dev is not upper device to upper_dev. */ + if (__netdev_find_upper(upper_dev, dev)) + return -EBUSY; + + if (__netdev_find_upper(dev, upper_dev)) + return -EEXIST; - seq_printf(seq, " %-8s %pF\n", - pt->dev ? pt->dev->name : "", pt->func); + if (master && netdev_master_upper_dev_get(dev)) + return -EBUSY; + + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + if (ret) + return ret; + + /* Now that we linked these devs, make all the upper_dev's + * upper_dev_list visible to every dev's lower_dev_list and vice + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } } - return 0; -} + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; + } -static const struct seq_operations ptype_seq_ops = { - .start = ptype_seq_start, - .next = ptype_seq_next, - .stop = ptype_seq_stop, - .show = ptype_seq_show, -}; + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } -static int ptype_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ptype_seq_ops, - sizeof(struct seq_net_private)); -} + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + return 0; -static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, - .open = ptype_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->lower_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } + i = NULL; -static int __net_init dev_proc_net_init(struct net *net) -{ - int rc = -ENOMEM; +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } - if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) - goto out; - if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) - goto out_dev; - if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) - goto out_softnet; + i = j = NULL; - if (wext_proc_init(net)) - goto out_ptype; - rc = 0; -out: - return rc; -out_ptype: - proc_net_remove(net, "ptype"); -out_softnet: - proc_net_remove(net, "softnet_stat"); -out_dev: - proc_net_remove(net, "dev"); - goto out; -} +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } -static void __net_exit dev_proc_net_exit(struct net *net) -{ - wext_proc_exit(net); + __netdev_adjacent_dev_unlink(dev, upper_dev); - proc_net_remove(net, "ptype"); - proc_net_remove(net, "softnet_stat"); - proc_net_remove(net, "dev"); + return ret; } -static struct pernet_operations __net_initdata dev_proc_ops = { - .init = dev_proc_net_init, - .exit = dev_proc_net_exit, -}; - -static int __init dev_proc_init(void) +/** + * netdev_upper_dev_link - Add a link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. The caller must hold + * the RTNL lock. On a failure a negative errno code is returned. + * On success the reference counts are adjusted and the function + * returns zero. + */ +int netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - return register_pernet_subsys(&dev_proc_ops); + return __netdev_upper_dev_link(dev, upper_dev, false); } -#else -#define dev_proc_init() 0 -#endif /* CONFIG_PROC_FS */ - +EXPORT_SYMBOL(netdev_upper_dev_link); /** - * netdev_set_master - set up master pointer - * @slave: slave device - * @master: new master device + * netdev_master_upper_dev_link - Add a master link to the upper device + * @dev: device + * @upper_dev: new upper device * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success the reference counts - * are adjusted and the function returns zero. + * Adds a link to device which is upper to this one. In this case, only + * one master upper device can be linked, although other non-master devices + * might be linked as well. The caller must hold the RTNL lock. + * On a failure a negative errno code is returned. On success the reference + * counts are adjusted and the function returns zero. */ -int netdev_set_master(struct net_device *slave, struct net_device *master) +int netdev_master_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - struct net_device *old = slave->master; - - ASSERT_RTNL(); - - if (master) { - if (old) - return -EBUSY; - dev_hold(master); - } - - slave->master = master; - - if (old) - dev_put(old); - return 0; + return __netdev_upper_dev_link(dev, upper_dev, true); } -EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_master_upper_dev_link); /** - * netdev_set_bond_master - set up bonding master/slave pair - * @slave: slave device - * @master: new master device + * netdev_upper_dev_unlink - Removes a link to upper device + * @dev: device + * @upper_dev: new upper device * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success %RTM_NEWLINK is sent - * to the routing socket and the function returns zero. + * Removes a link to device which is upper to this one. The caller must hold + * the RTNL lock. */ -int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +void netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) { - int err; - + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - err = netdev_set_master(slave, master); - if (err) - return err; - if (master) - slave->flags |= IFF_SLAVE; - else - slave->flags &= ~IFF_SLAVE; + __netdev_adjacent_dev_unlink(dev, upper_dev); - rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); - return 0; + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(j, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } -EXPORT_SYMBOL(netdev_set_bond_master); +EXPORT_SYMBOL(netdev_upper_dev_unlink); static void dev_change_rx_flags(struct net_device *dev, int flags) { @@ -4498,8 +4826,8 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; ASSERT_RTNL(); @@ -4530,8 +4858,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), - audit_get_loginuid(current), - uid, gid, + from_kuid(&init_user_ns, audit_get_loginuid(current)), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } @@ -4753,8 +5082,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) } if (dev->flags & IFF_UP && - (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = changes; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + } } /** @@ -4811,7 +5145,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) else dev->mtu = new_mtu; - if (!err && dev->flags & IFF_UP) + if (!err) call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); return err; } @@ -4847,372 +5181,51 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) if (!netif_device_present(dev)) return -ENODEV; err = ops->ndo_set_mac_address(dev, sa); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + if (err) + return err; + dev->addr_assign_type = NET_ADDR_SET; + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); - return err; + return 0; } EXPORT_SYMBOL(dev_set_mac_address); -/* - * Perform the SIOCxIFxxx calls, inside rcu_read_lock() - */ -static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ - int err; - struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); - - if (!dev) - return -ENODEV; - - switch (cmd) { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); - return 0; - - case SIOCGIFMETRIC: /* Get the metric on the interface - (currently unused) */ - ifr->ifr_metric = 0; - return 0; - - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; - - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - - case SIOCGIFSLAVE: - err = -EINVAL; - break; - - case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; - - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; - - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; - - default: - /* dev_ioctl() should ensure this case - * is never reached - */ - WARN_ON(1); - err = -ENOTTY; - break; - - } - return err; -} - -/* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() +/** + * dev_change_carrier - Change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +int dev_change_carrier(struct net_device *dev, bool new_carrier) { - int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - const struct net_device_ops *ops; - - if (!dev) - return -ENODEV; - - ops = dev->netdev_ops; - - switch (cmd) { - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + const struct net_device_ops *ops = dev->netdev_ops; - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ + if (!ops->ndo_change_carrier) return -EOPNOTSUPP; - - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); - - case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return 0; - - case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; - - case SIOCADDMULTI: - if (!ops->ndo_set_rx_mode || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); - - case SIOCDELMULTI: - if (!ops->ndo_set_rx_mode || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); - - case SIOCSIFTXQLEN: - if (ifr->ifr_qlen < 0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; - - case SIOCSIFNAME: - ifr->ifr_newname[IFNAMSIZ-1] = '\0'; - return dev_change_name(dev, ifr->ifr_newname); - - case SIOCSHWTSTAMP: - err = net_hwtstamp_validate(ifr); - if (err) - return err; - /* fall through */ - - /* - * Unknown or private ioctl - */ - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; - - } - return err; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_carrier(dev, new_carrier); } - -/* - * This function handles all "interface"-type I/O control requests. The actual - * 'doing' part of this is dev_ifsioc above. - */ +EXPORT_SYMBOL(dev_change_carrier); /** - * dev_ioctl - network device ioctl - * @net: the applicable net namespace - * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID * - * Issue ioctl functions to devices. This is normally called by the - * user space syscall interfaces but can sometimes be useful for - * other purposes. The return value is the return from the syscall if - * positive or a negative errno code on error. + * Get device physical port ID */ - -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) { - struct ifreq ifr; - int ret; - char *colon; - - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } - if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; - - colon = strchr(ifr.ifr_name, ':'); - if (colon) - *colon = 0; - - /* - * See which interface the caller is talking about. - */ - - switch (cmd) { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. - * - return a value - */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); - rcu_read_lock(); - ret = dev_ifsioc_locked(net, &ifr, cmd); - rcu_read_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(net, &ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - return a value - */ - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSIFNAME: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFMAP: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: - case SIOCSMIIREG: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: - case SIOCSHWTSTAMP: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* fall through */ - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - return ret; - - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but - * currently do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. - * Not applicable in our case */ - case SIOCSIFLINK: - return -ENOTTY; + const struct net_device_ops *ops = dev->netdev_ops; - /* - * Unknown or private ioctl. - */ - default: - if (cmd == SIOCWANDEV || - (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(net, &ifr, cmd, arg); - return -ENOTTY; - } + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); } - +EXPORT_SYMBOL(dev_get_phys_port_id); /** * dev_new_index - allocate an ifindex @@ -5224,21 +5237,23 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ static int dev_new_index(struct net *net) { - static int ifindex; + int ifindex = net->ifindex; for (;;) { if (++ifindex <= 0) ifindex = 1; if (!__dev_get_by_index(net, ifindex)) - return ifindex; + return net->ifindex = ifindex; } } /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); +static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) @@ -5300,17 +5315,17 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); + /* Notifier chain MUST detach us all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif } - /* Process any work delayed until the end of the batch */ - dev = list_first_entry(head, struct net_device, unreg_list); - call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - synchronize_net(); list_for_each_entry(dev, head, unreg_list) @@ -5336,20 +5351,25 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } - /* Fix illegal SG+CSUM combinations. */ - if ((features & NETIF_F_SG) && - !(features & NETIF_F_ALL_CSUM)) { - netdev_dbg(dev, - "Dropping NETIF_F_SG since no checksum feature.\n"); - features &= ~NETIF_F_SG; - } - /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } + if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IP_CSUM)) { + netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO; + features &= ~NETIF_F_TSO_ECN; + } + + if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IPV6_CSUM)) { + netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO6; + } + /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; @@ -5486,10 +5506,9 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - pr_err("netdev: Unable to allocate %u rx queues\n", count); + if (!rx) return -ENOMEM; - } + dev->_rx = rx; for (i = 0; i < count; i++) @@ -5512,17 +5531,27 @@ static void netdev_init_one_queue(struct net_device *dev, #endif } +static void netif_free_tx_queues(struct net_device *dev) +{ + if (is_vmalloc_addr(dev->_tx)) + vfree(dev->_tx); + else + kfree(dev->_tx); +} + static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; + size_t sz = count * sizeof(*tx); - BUG_ON(count < 1); + BUG_ON(count < 1 || count > 0xffff); - tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); if (!tx) { - pr_err("netdev: Unable to allocate %u tx queues\n", count); - return -ENOMEM; + tx = vzalloc(sz); + if (!tx) + return -ENOMEM; } dev->_tx = tx; @@ -5568,7 +5597,7 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; - ret = dev_get_valid_name(dev, dev->name); + ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; @@ -5582,7 +5611,21 @@ int register_netdevice(struct net_device *dev) } } - dev->ifindex = dev_new_index(net); + if (((dev->hw_features | dev->features) & + NETIF_F_HW_VLAN_CTAG_FILTER) && + (!dev->netdev_ops->ndo_vlan_rx_add_vid || + !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { + netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); + ret = -EINVAL; + goto err_uninit; + } + + ret = -EBUSY; + if (!dev->ifindex) + dev->ifindex = dev_new_index(net); + else if (__dev_get_by_index(net, dev->ifindex)) + goto err_uninit; + if (dev->iflink == -1) dev->iflink = dev->ifindex; @@ -5606,6 +5649,14 @@ int register_netdevice(struct net_device *dev) */ dev->vlan_features |= NETIF_F_HIGHDMA; + /* Make NETIF_F_SG inheritable to tunnel devices. + */ + dev->hw_enc_features |= NETIF_F_SG; + + /* Make NETIF_F_SG inheritable to MPLS. + */ + dev->mpls_features |= NETIF_F_SG; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) @@ -5625,11 +5676,20 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); + linkwatch_init_dev(dev); + dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); + /* If the device has permanent device address, driver should + * set dev_addr and also addr_assign_type should be set to + * NET_ADDR_PERM (default value). + */ + if (dev->addr_assign_type == NET_ADDR_PERM) + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); + /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); @@ -5732,6 +5792,7 @@ EXPORT_SYMBOL(netdev_refcnt_read); /** * netdev_wait_allrefs - wait until all references are gone. + * @dev: target net_device * * This is called when unregistering network devices. * @@ -5757,9 +5818,12 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users - * should have already handle it the first time */ + __rtnl_unlock(); + rcu_barrier(); + rtnl_lock(); + + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events @@ -5821,9 +5885,8 @@ void netdev_run_todo(void) __rtnl_unlock(); - /* Wait for rcu callbacks to finish before attempting to drain - * the device list. This usually avoids a 250ms wait. - */ + + /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); @@ -5832,6 +5895,10 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); + rtnl_lock(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + __rtnl_unlock(); + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", dev->name, dev->reg_state); @@ -5854,6 +5921,12 @@ void netdev_run_todo(void) if (dev->destructor) dev->destructor(dev); + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + /* Free network device */ kobject_put(&dev->dev.kobj); } @@ -5927,6 +6000,16 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) return queue; } +static const struct ethtool_ops default_ethtool_ops; + +void netdev_set_default_ethtool_ops(struct net_device *dev, + const struct ethtool_ops *ops) +{ + if (dev->ethtool_ops == &default_ethtool_ops) + dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -5971,10 +6054,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, alloc_size += NETDEV_ALIGN - 1; p = kzalloc(alloc_size, GFP_KERNEL); - if (!p) { - pr_err("alloc_netdev: Unable to allocate device\n"); + if (!p) return NULL; - } dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; @@ -5997,6 +6078,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); + INIT_LIST_HEAD(&dev->upper_dev_list); + INIT_LIST_HEAD(&dev->lower_dev_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); @@ -6014,6 +6097,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, strcpy(dev->name, name); dev->group = INIT_NETDEV_GROUP; + if (!dev->ethtool_ops) + dev->ethtool_ops = &default_ethtool_ops; return dev; free_all: @@ -6022,7 +6107,7 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - kfree(dev->_tx); + netif_free_tx_queues(dev); #ifdef CONFIG_RPS kfree(dev->_rx); #endif @@ -6047,7 +6132,7 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); - kfree(dev->_tx); + netif_free_tx_queues(dev); #ifdef CONFIG_RPS kfree(dev->_rx); #endif @@ -6181,7 +6266,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char goto out; /* Ensure the device has been registrered */ - err = -EINVAL; if (dev->reg_state != NETREG_REGISTERED) goto out; @@ -6198,7 +6282,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* We get here if we can't use the current device name */ if (!pat) goto out; - if (dev_get_valid_name(dev, pat) < 0) + if (dev_get_valid_name(net, dev, pat) < 0) goto out; } @@ -6226,7 +6310,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + rcu_barrier(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* @@ -6235,6 +6320,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_uc_flush(dev); dev_mc_flush(dev); + /* Send a netdev-removed uevent to the old namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -6246,6 +6334,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev->iflink = dev->ifindex; } + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + /* Fixup kobjects */ err = device_rename(&dev->dev, dev->name); WARN_ON(err); @@ -6352,7 +6443,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); -static struct hlist_head *netdev_create_hash(void) +static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; @@ -6409,22 +6500,26 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -int __netdev_printk(const char *level, const struct net_device *dev, +static int __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { int r; - if (dev && dev->dev.parent) - r = dev_printk(level, dev->dev.parent, "%s: %pV", - netdev_name(dev), vaf); - else if (dev) + if (dev && dev->dev.parent) { + r = dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), vaf); + } else if (dev) { r = printk("%s%s: %pV", level, netdev_name(dev), vaf); - else + } else { r = printk("%s(NULL net_device): %pV", level, vaf); + } return r; } -EXPORT_SYMBOL(__netdev_printk); int netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) @@ -6439,6 +6534,7 @@ int netdev_printk(const char *level, const struct net_device *dev, vaf.va = &args; r = __netdev_printk(level, dev, &vaf); + va_end(args); return r; @@ -6458,6 +6554,7 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.va = &args; \ \ r = __netdev_printk(level, dev, &vaf); \ + \ va_end(args); \ \ return r; \ @@ -6515,6 +6612,34 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list. + */ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network @@ -6526,7 +6651,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - rtnl_lock(); + /* To prevent network device cleanup code from dereferencing + * loopback devices or network devices that have been freed + * wait here for all pending unregistrations to complete, + * before unregistring the loopback device and allowing the + * network namespace be freed. + * + * The netdev todo list containing all network devices + * unregistrations that happen in default_device_exit_batch + * will run in the rtnl_unlock() at the end of + * default_device_exit_batch. + */ + rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops) @@ -6572,6 +6708,8 @@ static int __init net_dev_init(void) for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); + INIT_LIST_HEAD(&offload_base); + if (register_pernet_subsys(&netdev_net_ops)) goto out; @@ -6600,6 +6738,10 @@ static int __init net_dev_init(void) sd->backlog.weight = weight_p; sd->backlog.gro_list = NULL; sd->backlog.gro_count = 0; + +#ifdef CONFIG_NET_FLOW_LIMIT + sd->flow_limit = NULL; +#endif } dev_boot_phase = 0; @@ -6624,19 +6766,9 @@ static int __init net_dev_init(void) hotcpu_notifier(dev_cpu_callback, 0); dst_init(); - dev_mcast_init(); rc = 0; out: return rc; } subsys_initcall(net_dev_init); - -static int __init initialize_hashrnd(void) -{ - get_random_bytes(&hashrnd, sizeof(hashrnd)); - return 0; -} - -late_initcall_sync(initialize_hashrnd); - |