Diffstat (limited to 'drivers/net/vxlan/vxlan_core.c')
-rw-r--r--	drivers/net/vxlan/vxlan_core.c	1617
1 file changed, 919 insertions(+), 698 deletions(-)
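The bulk of this change replaces the driver's salted, fixed-size FDB hash buckets with an rhashtable keyed on a composite {MAC, VNI} key (struct vxlan_fdb_key), with lookups split into RCU, TX, and hash_lock variants. The sketch below is a minimal userspace illustration of that composite-key pattern, not kernel code: the fixed-size open-addressed table, the FNV-1a hash, and every name in it are illustrative stand-ins for rhashtable and the real structures. It shows why vxlan_find_mac_rcu() in the diff zeroes the key struct before filling it in: like rhashtable with key_len = sizeof(struct vxlan_fdb_key), it hashes and compares the raw bytes of the struct, padding included.

/* Minimal userspace sketch of a composite {MAC, VNI} key lookup.
 * NOT kernel code; all names and the table itself are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fdb_key {
	uint8_t  eth_addr[6];	/* MAC address */
	uint32_t vni;		/* VXLAN Network Identifier */
};

#define TABLE_SIZE 64

struct fdb_entry {
	struct fdb_key key;
	int used;
	const char *remote;	/* stand-in for the remote (rdst) list */
};

static struct fdb_entry table[TABLE_SIZE];

/* Hash the raw key bytes. Like rhashtable with
 * key_len = sizeof(struct vxlan_fdb_key), this covers padding bytes too,
 * which is why keys must be zeroed before being filled in.
 */
static uint32_t hash_key(const struct fdb_key *key)
{
	const uint8_t *p = (const uint8_t *)key;
	uint32_t h = 2166136261u;

	for (size_t i = 0; i < sizeof(*key); i++)
		h = (h ^ p[i]) * 16777619u;
	return h % TABLE_SIZE;
}

static void make_key(struct fdb_key *key, const uint8_t *mac, uint32_t vni)
{
	memset(key, 0, sizeof(*key));	/* zero padding before hash/memcmp */
	memcpy(key->eth_addr, mac, sizeof(key->eth_addr));
	key->vni = vni;
}

static struct fdb_entry *fdb_find(const uint8_t *mac, uint32_t vni)
{
	struct fdb_key key;

	make_key(&key, mac, vni);
	for (uint32_t i = 0, h = hash_key(&key); i < TABLE_SIZE; i++) {
		struct fdb_entry *e = &table[(h + i) % TABLE_SIZE];

		if (!e->used)
			return NULL;	/* empty slot: no such entry */
		if (!memcmp(&e->key, &key, sizeof(key)))
			return e;
	}
	return NULL;
}

static int fdb_insert(const uint8_t *mac, uint32_t vni, const char *remote)
{
	struct fdb_key key;

	make_key(&key, mac, vni);
	for (uint32_t i = 0, h = hash_key(&key); i < TABLE_SIZE; i++) {
		struct fdb_entry *e = &table[(h + i) % TABLE_SIZE];

		if (e->used && !memcmp(&e->key, &key, sizeof(key)))
			return -1;	/* duplicate entry */
		if (!e->used) {
			e->key = key;
			e->used = 1;
			e->remote = remote;
			return 0;
		}
	}
	return -1;			/* table full */
}

int main(void)
{
	const uint8_t mac[6] = { 0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };

	fdb_insert(mac, 100, "203.0.113.7");
	/* Same MAC, different VNI: a distinct key, so a distinct entry. */
	printf("vni 100: %s\n", fdb_find(mac, 100) ? "hit" : "miss");
	printf("vni 200: %s\n", fdb_find(mac, 200) ? "hit" : "miss");
	return 0;
}

The kernel side delegates all of this to rhashtable_lookup() and rhashtable_lookup_insert_fast() with the vxlan_fdb_rht_params declared at the top of the diff (automatic_shrinking included), so duplicate detection and resizing come for free.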
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c9a9373733c0..e957aa12a8a4 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -15,6 +15,7 @@ #include <linux/igmp.h> #include <linux/if_ether.h> #include <linux/ethtool.h> +#include <linux/rhashtable.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/gro.h> @@ -25,6 +26,7 @@ #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/tun_proto.h> #include <net/vxlan.h> #include <net/nexthop.h> @@ -62,8 +64,12 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); -/* salt for hash table */ -static u32 vxlan_salt __read_mostly; +static const struct rhashtable_params vxlan_fdb_rht_params = { + .head_offset = offsetof(struct vxlan_fdb, rhnode), + .key_offset = offsetof(struct vxlan_fdb, key), + .key_len = sizeof(struct vxlan_fdb_key), + .automatic_shrinking = true, +}; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { @@ -185,7 +191,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, } else if (nh) { ndm->ndm_family = nh_family; } - send_eth = !is_zero_ether_addr(fdb->eth_addr); + send_eth = !is_zero_ether_addr(fdb->key.eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; @@ -200,7 +206,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; - if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) + if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) @@ -222,14 +228,14 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, goto nla_put_failure; } - if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && + if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->key.vni && nla_put_u32(skb, NDA_SRC_VNI, - be32_to_cpu(fdb->vni))) + be32_to_cpu(fdb->key.vni))) goto nla_put_failure; - ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); ci.ndm_confirmed = 0; - ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_updated = jiffies_to_clock_t(now - READ_ONCE(fdb->updated)); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) @@ -277,8 +283,7 @@ static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, @@ -293,8 +298,8 @@ static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; - memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); - fdb_info->vni = fdb->vni; + memcpy(fdb_info->eth_addr, fdb->key.eth_addr, ETH_ALEN); + fdb_info->vni = fdb->key.vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } @@ -366,67 +371,42 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) }; struct vxlan_rdst remote = { }; - memcpy(f.eth_addr, eth_addr, ETH_ALEN); + memcpy(f.key.eth_addr, eth_addr, 
ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } -/* Hash Ethernet address */ -static u32 eth_hash(const unsigned char *addr) -{ - u64 value = get_unaligned((u64 *)addr); - - /* only want 6 bytes */ -#ifdef __BIG_ENDIAN - value >>= 16; -#else - value <<= 16; -#endif - return hash_64(value, FDB_HASH_BITS); -} - -u32 eth_vni_hash(const unsigned char *addr, __be32 vni) +/* Look up Ethernet address in forwarding table */ +static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { - /* use 1 byte of OUI and 3 bytes of NIC */ - u32 key = get_unaligned((u32 *)(addr + 2)); + struct vxlan_fdb_key key; - return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); -} - -u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) -{ - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) - return eth_vni_hash(mac, vni); + memset(&key, 0, sizeof(key)); + memcpy(key.eth_addr, mac, sizeof(key.eth_addr)); + if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) + key.vni = vxlan->default_dst.remote_vni; else - return eth_hash(mac); -} + key.vni = vni; -/* Hash chain to use given mac address */ -static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) -{ - return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; + return rhashtable_lookup(&vxlan->fdb_hash_tbl, &key, + vxlan_fdb_rht_params); } -/* Look up Ethernet address in forwarding table */ -static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) +static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { - struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; - hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->eth_addr)) { - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { - if (vni == f->vni) - return f; - } else { - return f; - } - } + f = vxlan_find_mac_rcu(vxlan, mac, vni); + if (f) { + unsigned long now = jiffies; + + if (READ_ONCE(f->used) != now) + WRITE_ONCE(f->used, now); } - return NULL; + return f; } static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, @@ -434,9 +414,11 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, vni); - if (f && f->used != jiffies) - f->used = jiffies; + lockdep_assert_held_once(&vxlan->hash_lock); + + rcu_read_lock(); + f = vxlan_find_mac_rcu(vxlan, mac, vni); + rcu_read_unlock(); return f; } @@ -464,7 +446,7 @@ int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, { struct vxlan_dev *vxlan = netdev_priv(dev); u8 eth_addr[ETH_ALEN + 2] = { 0 }; - struct vxlan_rdst *rdst; + struct vxlan_rdst *rdst = NULL; struct vxlan_fdb *f; int rc = 0; @@ -476,13 +458,14 @@ int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, eth_addr, vni); - if (!f) { + f = vxlan_find_mac_rcu(vxlan, eth_addr, vni); + if (f) + rdst = first_remote_rcu(f); + if (!rdst) { rc = -ENOENT; goto out; } - rdst = first_remote_rcu(f); vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info); out: @@ -513,32 +496,28 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; int rc = 0; if (!netif_is_vxlan(dev)) return -EINVAL; vxlan = netdev_priv(dev); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); - 
hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { - if (f->vni == vni) { - list_for_each_entry(rdst, &f->remotes, list) { - rc = vxlan_fdb_notify_one(nb, vxlan, - f, rdst, - extack); - if (rc) - goto unlock; - } + spin_lock_bh(&vxlan->hash_lock); + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->key.vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) { + rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, + extack); + if (rc) + goto unlock; } } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); return 0; unlock: - spin_unlock_bh(&vxlan->hash_lock[h]); + spin_unlock_bh(&vxlan->hash_lock); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); @@ -548,20 +527,19 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); - hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) - if (f->vni == vni) - list_for_each_entry(rdst, &f->remotes, list) - rdst->offloaded = false; - spin_unlock_bh(&vxlan->hash_lock[h]); + spin_lock_bh(&vxlan->hash_lock); + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->key.vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) + rdst->offloaded = false; + } } + spin_unlock_bh(&vxlan->hash_lock); } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); @@ -606,10 +584,10 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, if (rd == NULL) return -ENOMEM; - if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { - kfree(rd); - return -ENOMEM; - } + /* The driver can work correctly without a dst cache, so do not treat + * dst cache initialization errors as fatal. + */ + dst_cache_init(&rd->dst_cache, GFP_ATOMIC | __GFP_NOWARN); rd->remote_ip = *ip; rd->remote_port = port; @@ -623,9 +601,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol) +static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol) { - struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr; + const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. 
*/ if (!gpe->np_applied) @@ -799,27 +777,20 @@ static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; + memset(&f->key, 0, sizeof(f->key)); f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; - f->vni = src_vni; + f->key.vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); - memcpy(f->eth_addr, mac, ETH_ALEN); + memcpy(f->key.eth_addr, mac, ETH_ALEN); return f; } -static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, - __be32 src_vni, struct vxlan_fdb *f) -{ - ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac, src_vni)); -} - static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { @@ -909,10 +880,27 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, if (rc < 0) goto errout; + rc = rhashtable_lookup_insert_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); + if (rc) + goto destroy_remote; + + ++vxlan->addrcnt; + hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); + *fdb = f; return 0; +destroy_remote: + if (rcu_access_pointer(f->nh)) { + list_del_rcu(&f->nh_list); + nexthop_put(rtnl_dereference(f->nh)); + } else { + list_del(&rd->list); + dst_cache_destroy(&rd->dst_cache); + kfree(rd); + } errout: kfree(f); return rc; @@ -949,7 +937,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, { struct vxlan_rdst *rd; - netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); + netdev_dbg(vxlan->dev, "delete %pM\n", f->key.eth_addr); --vxlan->addrcnt; if (do_notify) { @@ -962,7 +950,9 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, swdev_notify, NULL); } - hlist_del_rcu(&f->hlist); + hlist_del_init_rcu(&f->fdb_node); + rhashtable_remove_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } @@ -1010,20 +1000,18 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; - f->updated = jiffies; notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; - f->updated = jiffies; notify = 1; } } if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ - if (!(is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + if (!(is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) @@ -1039,8 +1027,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } } if ((flags & NLM_F_APPEND) && - (is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + (is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) @@ -1049,12 +1037,13 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } if (ndm_flags & NTF_USE) - f->used = jiffies; + WRITE_ONCE(f->updated, jiffies); if (notify) { if (rd == NULL) rd = first_remote_rtnl(f); + WRITE_ONCE(f->updated, jiffies); err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify, extack); if (err) @@ -1098,7 +1087,6 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, if (rc < 0) return rc; - vxlan_fdb_insert(vxlan, mac, src_vni, f); rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), 
RTM_NEWNEIGH, swdev_notify, extack); if (rc) @@ -1122,7 +1110,7 @@ int vxlan_fdb_update(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, src_vni); + f = vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, @@ -1233,10 +1221,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *ifindex = 0; } - if (tb[NDA_NH_ID]) - *nhid = nla_get_u32(tb[NDA_NH_ID]); - else - *nhid = 0; + *nhid = nla_get_u32_default(tb[NDA_NH_ID], 0); return 0; } @@ -1245,7 +1230,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ @@ -1253,7 +1238,6 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { @@ -1273,13 +1257,15 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); + + if (!err) + *notified = true; return err; } @@ -1320,14 +1306,13 @@ out: /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; __be16 port; int err; @@ -1336,11 +1321,13 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], if (err) return err; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); + + if (!err) + *notified = true; return err; } @@ -1350,53 +1337,48 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; + struct vxlan_fdb *f; int err = 0; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct vxlan_fdb *f; - - rcu_read_lock(); - hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { - struct vxlan_rdst *rd; - - if (rcu_access_pointer(f->nh)) { - if (*idx < cb->args[2]) - goto skip_nh; - err = vxlan_fdb_info(skb, vxlan, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, NULL); - if (err < 0) { - rcu_read_unlock(); - goto out; - } -skip_nh: - *idx += 1; - continue; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + struct vxlan_rdst *rd; + + if 
(rcu_access_pointer(f->nh)) { + if (*idx < ctx->fdb_idx) + goto skip_nh; + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, NULL); + if (err < 0) { + rcu_read_unlock(); + goto out; } +skip_nh: + *idx += 1; + continue; + } - list_for_each_entry_rcu(rd, &f->remotes, list) { - if (*idx < cb->args[2]) - goto skip; - - err = vxlan_fdb_info(skb, vxlan, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, rd); - if (err < 0) { - rcu_read_unlock(); - goto out; - } -skip: - *idx += 1; + list_for_each_entry_rcu(rd, &f->remotes, list) { + if (*idx < ctx->fdb_idx) + goto skip; + + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, rd); + if (err < 0) { + rcu_read_unlock(); + goto out; } +skip: + *idx += 1; } - rcu_read_unlock(); } + rcu_read_unlock(); out: return err; } @@ -1420,7 +1402,7 @@ static int vxlan_fdb_get(struct sk_buff *skb, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, addr, vni); + f = vxlan_find_mac_rcu(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; @@ -1436,37 +1418,45 @@ errout: /* Watch incoming packets to learn mapping between Ethernet address * and Tunnel endpoint. - * Return true if packet is bogus and should be dropped. */ -static bool vxlan_snoop(struct net_device *dev, - union vxlan_addr *src_ip, const u8 *src_mac, - u32 src_ifindex, __be32 vni) +static enum skb_drop_reason vxlan_snoop(struct net_device *dev, + union vxlan_addr *src_ip, + const u8 *src_mac, u32 src_ifindex, + __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 ifindex = 0; + /* Ignore packets from invalid src-address */ + if (!is_valid_ether_addr(src_mac)) + return SKB_DROP_REASON_MAC_INVALID_SOURCE; + #if IS_ENABLED(CONFIG_IPV6) if (src_ip->sa.sa_family == AF_INET6 && (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)) ifindex = src_ifindex; #endif - f = vxlan_find_mac(vxlan, src_mac, vni); + f = vxlan_find_mac_rcu(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); + unsigned long now = jiffies; + + if (READ_ONCE(f->updated) != now) + WRITE_ONCE(f->updated, now); + + /* Don't override an fdb with nexthop with a learnt entry */ + if (rcu_access_pointer(f->nh)) + return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) - return false; + return SKB_NOT_DROPPED_YET; /* Don't migrate static entries, drop packets */ if (f->state & (NUD_PERMANENT | NUD_NOARP)) - return true; - - /* Don't override an fdb with nexthop with a learnt entry */ - if (rcu_access_pointer(f->nh)) - return true; + return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; if (net_ratelimit()) netdev_info(dev, @@ -1474,13 +1464,10 @@ static bool vxlan_snoop(struct net_device *dev, src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; - f->updated = jiffies; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { - u32 hash_index = fdb_head_index(vxlan, src_mac, vni); - /* learned new entry */ - spin_lock(&vxlan->hash_lock[hash_index]); + spin_lock(&vxlan->hash_lock); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) @@ -1491,29 +1478,26 @@ static bool vxlan_snoop(struct net_device *dev, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); - spin_unlock(&vxlan->hash_lock[hash_index]); + spin_unlock(&vxlan->hash_lock); 
} - return false; + return SKB_NOT_DROPPED_YET; } static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) { - struct vxlan_net *vn; + ASSERT_RTNL(); if (!vs) return false; if (!refcount_dec_and_test(&vs->refcnt)) return false; - vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); - spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); udp_tunnel_notify_del_rx_port(vs->sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); - spin_unlock(&vn->sock_lock); return true; } @@ -1548,43 +1532,45 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan) #endif } -static bool vxlan_remcsum(struct vxlanhdr *unparsed, - struct sk_buff *skb, u32 vxflags) +static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags) { + const struct vxlanhdr *vh = vxlan_hdr(skb); + enum skb_drop_reason reason; size_t start, offset; - if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) - goto out; + if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) + return SKB_NOT_DROPPED_YET; - start = vxlan_rco_start(unparsed->vx_vni); - offset = start + vxlan_rco_offset(unparsed->vx_vni); + start = vxlan_rco_start(vh->vx_vni); + offset = start + vxlan_rco_offset(vh->vx_vni); - if (!pskb_may_pull(skb, offset + sizeof(u16))) - return false; + reason = pskb_may_pull_reason(skb, offset + sizeof(u16)); + if (reason) + return reason; skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); -out: - unparsed->vx_flags &= ~VXLAN_HF_RCO; - unparsed->vx_vni &= VXLAN_VNI_MASK; - return true; + return SKB_NOT_DROPPED_YET; } -static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, - struct sk_buff *skb, u32 vxflags, +static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { - struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; + const struct vxlanhdr *vh = vxlan_hdr(skb); + const struct vxlanhdr_gbp *gbp; struct metadata_dst *tun_dst; - if (!(unparsed->vx_flags & VXLAN_HF_GBP)) - goto out; + gbp = (const struct vxlanhdr_gbp *)vh; + + if (!(vh->vx_flags & VXLAN_HF_GBP)) + return; md->gbp = ntohs(gbp->policy_id); tun_dst = (struct metadata_dst *)skb_dst(skb); if (tun_dst) { - tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, + tun_dst->u.tun_info.key.tun_flags); tun_dst->u.tun_info.options_len = sizeof(*md); } if (gbp->dont_learn) @@ -1596,13 +1582,11 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; -out: - unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } -static bool vxlan_set_mac(struct vxlan_dev *vxlan, - struct vxlan_sock *vs, - struct sk_buff *skb, __be32 vni) +static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan, + struct vxlan_sock *vs, + struct sk_buff *skb, __be32 vni) { union vxlan_addr saddr; u32 ifindex = skb->dev->ifindex; @@ -1613,7 +1597,7 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan, /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) - return false; + return SKB_DROP_REASON_LOCAL_MAC; /* Get address from the outer IP header */ if (vxlan_get_sk_family(vs) == AF_INET) { @@ -1626,11 +1610,11 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan, #endif } - if ((vxlan->cfg.flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni)) - return false; + if 
(!(vxlan->cfg.flags & VXLAN_F_LEARN)) + return SKB_NOT_DROPPED_YET; - return true; + return vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, + ifindex, vni); } static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, @@ -1657,72 +1641,96 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, return err <= 1; } -/* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { struct vxlan_vni_node *vninode = NULL; + const struct vxlanhdr *vh; struct vxlan_dev *vxlan; struct vxlan_sock *vs; - struct vxlanhdr unparsed; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); + enum skb_drop_reason reason; bool raw_proto = false; void *oiph; __be32 vni = 0; + int nh; /* Need UDP and VXLAN header to be present */ - if (!pskb_may_pull(skb, VXLAN_HLEN)) + reason = pskb_may_pull_reason(skb, VXLAN_HLEN); + if (reason) goto drop; - unparsed = *vxlan_hdr(skb); + vh = vxlan_hdr(skb); /* VNI flag always required to be set */ - if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { + if (!(vh->vx_flags & VXLAN_HF_VNI)) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxlan_hdr(skb)->vx_flags), - ntohl(vxlan_hdr(skb)->vx_vni)); + ntohl(vh->vx_flags), ntohl(vh->vx_vni)); + reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; /* Return non vxlan pkt */ goto drop; } - unparsed.vx_flags &= ~VXLAN_HF_VNI; - unparsed.vx_vni &= ~VXLAN_VNI_MASK; vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; - vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); + vni = vxlan_vni(vh->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); - if (!vxlan) + if (!vxlan) { + reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND; goto drop; + } - /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly requested. - */ - if (vs->flags & VXLAN_F_GPE) { - if (!vxlan_parse_gpe_proto(&unparsed, &protocol)) + if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags || + vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) { + /* If the header uses bits besides those enabled by the + * netdevice configuration, treat this as a malformed packet. + * This behavior diverges from VXLAN RFC (RFC7348) which + * stipulates that bits in reserved in reserved fields are to be + * ignored. The approach here maintains compatibility with + * previous stack code, and also is more robust and provides a + * little more security in adding extensions to VXLAN. 
+ */ + reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; + DEV_STATS_INC(vxlan->dev, rx_frame_errors); + DEV_STATS_INC(vxlan->dev, rx_errors); + vxlan_vnifilter_count(vxlan, vni, vninode, + VXLAN_VNI_STATS_RX_ERRORS, 0); + goto drop; + } + + if (vxlan->cfg.flags & VXLAN_F_GPE) { + if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; - unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; } if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, - !net_eq(vxlan->net, dev_net(vxlan->dev)))) + !net_eq(vxlan->net, dev_net(vxlan->dev)))) { + reason = SKB_DROP_REASON_NOMEM; goto drop; + } - if (vs->flags & VXLAN_F_REMCSUM_RX) - if (unlikely(!vxlan_remcsum(&unparsed, skb, vs->flags))) + if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) { + reason = vxlan_remcsum(skb, vxlan->cfg.flags); + if (unlikely(reason)) goto drop; + } if (vxlan_collect_metadata(vs)) { + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst; - tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, + __set_bit(IP_TUNNEL_KEY_BIT, flags); + tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags, key32_to_tunnel_id(vni), sizeof(*md)); - if (!tun_dst) + if (!tun_dst) { + reason = SKB_DROP_REASON_NOMEM; goto drop; + } md = ip_tunnel_info_opts(&tun_dst->u.tun_info); @@ -1731,26 +1739,15 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - if (vs->flags & VXLAN_F_GBP) - vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); + if (vxlan->cfg.flags & VXLAN_F_GBP) + vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md); /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ - if (unparsed.vx_flags || unparsed.vx_vni) { - /* If there are any unprocessed flags remaining treat - * this as a malformed packet. This behavior diverges from - * VXLAN RFC (RFC7348) which stipulates that bits in reserved - * in reserved fields are to be ignored. The approach here - * maintains compatibility with previous stack code, and also - * is more robust and provides a little more security in - * adding extensions to VXLAN. - */ - goto drop; - } - if (!raw_proto) { - if (!vxlan_set_mac(vxlan, vs, skb, vni)) + reason = vxlan_set_mac(vxlan, vs, skb, vni); + if (reason) goto drop; } else { skb_reset_mac_header(skb); @@ -1758,12 +1755,30 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; } - oiph = skb_network_header(skb); + /* Save offset of outer header relative to skb->head, + * because we are going to reset the network header to the inner header + * and might change skb->head. + */ + nh = skb_network_header(skb) - skb->head; + skb_reset_network_header(skb); + reason = pskb_inet_may_pull_reason(skb); + if (reason) { + DEV_STATS_INC(vxlan->dev, rx_length_errors); + DEV_STATS_INC(vxlan->dev, rx_errors); + vxlan_vnifilter_count(vxlan, vni, vninode, + VXLAN_VNI_STATS_RX_ERRORS, 0); + goto drop; + } + + /* Get the outer header. 
*/ + oiph = skb->head + nh; + if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { - ++vxlan->dev->stats.rx_frame_errors; - ++vxlan->dev->stats.rx_errors; + reason = SKB_DROP_REASON_IP_TUNNEL_ECN; + DEV_STATS_INC(vxlan->dev, rx_frame_errors); + DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; @@ -1773,13 +1788,14 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); - dev_core_stats_rx_dropped_inc(vxlan->dev); + dev_dstats_rx_dropped(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); + reason = SKB_DROP_REASON_DEV_READY; goto drop; } - dev_sw_netstats_rx_add(vxlan->dev, skb->len); + dev_dstats_rx_add(vxlan->dev, skb->len); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); @@ -1788,12 +1804,12 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) return 0; drop: + reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED; /* Consume bad packet */ - kfree_skb(skb); + kfree_skb_reason(skb, reason); return 0; } -/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */ static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) { struct vxlan_dev *vxlan; @@ -1833,7 +1849,9 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev->stats.tx_dropped++; + dev_dstats_tx_dropped(dev); + vxlan_vnifilter_count(vxlan, vni, NULL, + VXLAN_VNI_STATS_TX_DROPS, 0); goto out; } parp = arp_hdr(skb); @@ -1860,6 +1878,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) n = neigh_lookup(&arp_tbl, &tip, dev); if (n) { + struct vxlan_rdst *rdst = NULL; struct vxlan_fdb *f; struct sk_buff *reply; @@ -1868,12 +1887,17 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } - f = vxlan_find_mac(vxlan, n->ha, vni); - if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { + rcu_read_lock(); + f = vxlan_find_mac_tx(vxlan, n->ha, vni); + if (f) + rdst = first_remote_rcu(f); + if (rdst && vxlan_addr_any(&rdst->remote_ip)) { /* bridge-local neighbor */ neigh_release(n); + rcu_read_unlock(); goto out; } + rcu_read_unlock(); reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); @@ -1889,7 +1913,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { - dev->stats.rx_dropped++; + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2024,6 +2048,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); if (n) { + struct vxlan_rdst *rdst = NULL; struct vxlan_fdb *f; struct sk_buff *reply; @@ -2032,8 +2057,10 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } - f = vxlan_find_mac(vxlan, n->ha, vni); - if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { + f = vxlan_find_mac_tx(vxlan, n->ha, vni); + if (f) + rdst = first_remote_rcu(f); + if (rdst && vxlan_addr_any(&rdst->remote_ip)) { /* bridge-local neighbor */ neigh_release(n); goto out; @@ -2048,7 +2075,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (netif_rx(reply) == NET_RX_DROP) { - 
dev->stats.rx_dropped++; + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2215,127 +2242,16 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, return 0; } -static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev, - struct vxlan_sock *sock4, - struct sk_buff *skb, int oif, u8 tos, - __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport, - __u8 flow_flags, struct dst_cache *dst_cache, - const struct ip_tunnel_info *info) -{ - bool use_cache = ip_tunnel_dst_cache_usable(skb, info); - struct rtable *rt = NULL; - struct flowi4 fl4; - - if (!sock4) - return ERR_PTR(-EIO); - - if (tos && !info) - use_cache = false; - if (use_cache) { - rt = dst_cache_get_ip4(dst_cache, saddr); - if (rt) - return rt; - } - - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = oif; - fl4.flowi4_tos = RT_TOS(tos); - fl4.flowi4_mark = skb->mark; - fl4.flowi4_proto = IPPROTO_UDP; - fl4.daddr = daddr; - fl4.saddr = *saddr; - fl4.fl4_dport = dport; - fl4.fl4_sport = sport; - fl4.flowi4_flags = flow_flags; - - rt = ip_route_output_key(vxlan->net, &fl4); - if (!IS_ERR(rt)) { - if (rt->dst.dev == dev) { - netdev_dbg(dev, "circular route to %pI4\n", &daddr); - ip_rt_put(rt); - return ERR_PTR(-ELOOP); - } - - *saddr = fl4.saddr; - if (use_cache) - dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); - } else { - netdev_dbg(dev, "no route to %pI4\n", &daddr); - return ERR_PTR(-ENETUNREACH); - } - return rt; -} - -#if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, - struct net_device *dev, - struct vxlan_sock *sock6, - struct sk_buff *skb, int oif, u8 tos, - __be32 label, - const struct in6_addr *daddr, - struct in6_addr *saddr, - __be16 dport, __be16 sport, - struct dst_cache *dst_cache, - const struct ip_tunnel_info *info) -{ - bool use_cache = ip_tunnel_dst_cache_usable(skb, info); - struct dst_entry *ndst; - struct flowi6 fl6; - - if (!sock6) - return ERR_PTR(-EIO); - - if (tos && !info) - use_cache = false; - if (use_cache) { - ndst = dst_cache_get_ip6(dst_cache, saddr); - if (ndst) - return ndst; - } - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; - fl6.daddr = *daddr; - fl6.saddr = *saddr; - fl6.flowlabel = ip6_make_flowinfo(tos, label); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = IPPROTO_UDP; - fl6.fl6_dport = dport; - fl6.fl6_sport = sport; - - ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk, - &fl6, NULL); - if (IS_ERR(ndst)) { - netdev_dbg(dev, "no route to %pI6\n", daddr); - return ERR_PTR(-ENETUNREACH); - } - - if (unlikely(ndst->dev == dev)) { - netdev_dbg(dev, "circular route to %pI6\n", daddr); - dst_release(ndst); - return ERR_PTR(-ELOOP); - } - - *saddr = fl6.saddr; - if (use_cache) - dst_cache_set_ip6(dst_cache, ndst, saddr); - return ndst; -} -#endif - /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan, __be32 vni, bool snoop) { - struct pcpu_sw_netstats *tx_stats, *rx_stats; union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; + unsigned int len = skb->len; struct net_device *dev; - int len = skb->len; - tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); - rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; @@ -2354,29 +2270,23 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev 
*src_vxlan, rcu_read_lock(); dev = skb->dev; if (unlikely(!(dev->flags & IFF_UP))) { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY); goto drop; } if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop) vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); - u64_stats_update_begin(&tx_stats->syncp); - u64_stats_inc(&tx_stats->tx_packets); - u64_stats_add(&tx_stats->tx_bytes, len); - u64_stats_update_end(&tx_stats->syncp); + dev_dstats_tx_add(src_vxlan->dev, len); vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { - u64_stats_update_begin(&rx_stats->syncp); - u64_stats_inc(&rx_stats->rx_packets); - u64_stats_add(&rx_stats->rx_bytes, len); - u64_stats_update_end(&rx_stats->syncp); + dev_dstats_rx_add(dst_vxlan->dev, len); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, len); } else { drop: - dev->stats.rx_dropped++; + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2385,7 +2295,7 @@ drop: static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan, - union vxlan_addr *daddr, + int addr_family, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, u32 rt_flags) @@ -2405,13 +2315,13 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, - daddr->sa.sa_family, dst_port, + addr_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND); return -ENOENT; } @@ -2427,31 +2337,43 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, { struct dst_cache *dst_cache; struct ip_tunnel_info *info; + struct ip_tunnel_key *pkey; + struct ip_tunnel_key key; struct vxlan_dev *vxlan = netdev_priv(dev); - const struct iphdr *old_iph = ip_hdr(skb); - union vxlan_addr *dst; - union vxlan_addr remote_ip, local_ip; + const struct iphdr *old_iph; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; unsigned int pkt_len = skb->len; __be16 src_port = 0, dst_port; struct dst_entry *ndst = NULL; - __u8 tos, ttl, flow_flags = 0; + int addr_family; + __u8 tos, ttl; int ifindex; - int err; + int err = 0; u32 flags = vxlan->cfg.flags; + bool use_cache; bool udp_sum = false; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); + enum skb_drop_reason reason; + bool no_eth_encap; __be32 vni = 0; -#if IS_ENABLED(CONFIG_IPV6) - __be32 label; -#endif + + no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB); + reason = skb_vlan_inet_prepare(skb, no_eth_encap); + if (reason) + goto drop; + + reason = SKB_DROP_REASON_NOT_SPECIFIED; + old_iph = ip_hdr(skb); info = skb_tunnel_info(skb); + use_cache = ip_tunnel_dst_cache_usable(skb, info); if (rdst) { - dst = &rdst->remote_ip; - if (vxlan_addr_any(dst)) { + memset(&key, 0, sizeof(key)); + pkey = &key; + + if (vxlan_addr_any(&rdst->remote_ip)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan, @@ -2461,30 +2383,50 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto drop; } + addr_family = vxlan->cfg.saddr.sa.sa_family; dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = (rdst->remote_vni) ? 
: default_vni; ifindex = rdst->remote_ifindex; - local_ip = vxlan->cfg.saddr; + + if (addr_family == AF_INET) { + key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr; + key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr; + } else { + key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr; + key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr; + } + dst_cache = &rdst->dst_cache; md->gbp = skb->mark; if (flags & VXLAN_F_TTL_INHERIT) { ttl = ip_tunnel_get_ttl(old_iph, skb); } else { ttl = vxlan->cfg.ttl; - if (!ttl && vxlan_addr_multicast(dst)) + if (!ttl && vxlan_addr_multicast(&rdst->remote_ip)) ttl = 1; } - tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); + if (tos && !info) + use_cache = false; - if (dst->sa.sa_family == AF_INET) + if (addr_family == AF_INET) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); else udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); #if IS_ENABLED(CONFIG_IPV6) - label = vxlan->cfg.label; + switch (vxlan->cfg.label_policy) { + case VXLAN_LABEL_FIXED: + key.label = vxlan->cfg.label; + break; + case VXLAN_LABEL_INHERIT: + key.label = ip_tunnel_get_flowlabel(old_iph, skb); + break; + default: + DEBUG_NET_WARN_ON_ONCE(1); + goto drop; + } #endif } else { if (!info) { @@ -2492,57 +2434,56 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dev->name); goto drop; } - remote_ip.sa.sa_family = ip_tunnel_info_af(info); - if (remote_ip.sa.sa_family == AF_INET) { - remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; - local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src; - } else { - remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; - local_ip.sin6.sin6_addr = info->key.u.ipv6.src; - } - dst = &remote_ip; + pkey = &info->key; + addr_family = ip_tunnel_info_af(info); dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; - flow_flags = info->key.flow_flags; vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; - if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { if (info->options_len < sizeof(*md)) goto drop; md = ip_tunnel_info_opts(info); } ttl = info->key.ttl; tos = info->key.tos; -#if IS_ENABLED(CONFIG_IPV6) - label = info->key.label; -#endif - udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); + udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); } src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); rcu_read_lock(); - if (dst->sa.sa_family == AF_INET) { - struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); + if (addr_family == AF_INET) { + struct vxlan_sock *sock4; + u16 ipcb_flags = 0; struct rtable *rt; __be16 df = 0; + __be32 saddr; + + sock4 = rcu_dereference(vxlan->vn4_sock); + if (unlikely(!sock4)) { + reason = SKB_DROP_REASON_DEV_READY; + goto tx_error; + } if (!ifindex) ifindex = sock4->sock->sk->sk_bound_dev_if; - rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos, - dst->sin.sin_addr.s_addr, - &local_ip.sin.sin_addr.s_addr, - dst_port, src_port, flow_flags, - dst_cache, info); + rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex, + &saddr, pkey, src_port, dst_port, + tos, use_cache ? 
dst_cache : NULL); if (IS_ERR(rt)) { err = PTR_ERR(rt); + reason = SKB_DROP_REASON_IP_OUTNOROUTES; goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ipcb_flags |= IPSKB_MCROUTE; + if (!info) { /* Bypass encapsulation if the destination is local */ - err = encap_bypass_if_local(skb, dev, vxlan, dst, + err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, dst_port, ifindex, vni, &rt->dst, rt->rt_flags); if (err) @@ -2558,7 +2499,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, old_iph->frag_off & htons(IP_DF))) df = htons(IP_DF); } - } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) { + } else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + info->key.tun_flags)) { df = htons(IP_DF); } @@ -2570,16 +2512,13 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } else if (err) { if (info) { struct ip_tunnel_info *unclone; - struct in_addr src, dst; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; - src = remote_ip.sin.sin_addr; - dst = local_ip.sin.sin_addr; - unclone->key.u.ipv4.src = src.s_addr; - unclone->key.u.ipv4.dst = dst.s_addr; + unclone->key.u.ipv4.src = pkey->u.ipv4.dst; + unclone->key.u.ipv4.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); @@ -2590,34 +2529,48 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), vni, md, flags, udp_sum); - if (err < 0) + if (err < 0) { + reason = SKB_DROP_REASON_NOMEM; goto tx_error; + } - udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr, - dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, xnet, !udp_sum); + udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, + pkey->u.ipv4.dst, tos, ttl, df, + src_port, dst_port, xnet, !udp_sum, + ipcb_flags); #if IS_ENABLED(CONFIG_IPV6) } else { - struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); + struct vxlan_sock *sock6; + struct in6_addr saddr; + u16 ip6cb_flags = 0; + + sock6 = rcu_dereference(vxlan->vn6_sock); + if (unlikely(!sock6)) { + reason = SKB_DROP_REASON_DEV_READY; + goto tx_error; + } if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; - ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos, - label, &dst->sin6.sin6_addr, - &local_ip.sin6.sin6_addr, - dst_port, src_port, - dst_cache, info); + ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, + ifindex, &saddr, pkey, + src_port, dst_port, tos, + use_cache ? 
dst_cache : NULL); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); ndst = NULL; + reason = SKB_DROP_REASON_IP_OUTNOROUTES; goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ip6cb_flags |= IP6SKB_MCROUTE; + if (!info) { - u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; + u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags; - err = encap_bypass_if_local(skb, dev, vxlan, dst, + err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6, dst_port, ifindex, vni, ndst, rt6i_flags); if (err) @@ -2632,16 +2585,13 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } else if (err) { if (info) { struct ip_tunnel_info *unclone; - struct in6_addr src, dst; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; - src = remote_ip.sin6.sin6_addr; - dst = local_ip.sin6.sin6_addr; - unclone->key.u.ipv6.src = src; - unclone->key.u.ipv6.dst = dst; + unclone->key.u.ipv6.src = pkey->u.ipv6.dst; + unclone->key.u.ipv6.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); @@ -2654,13 +2604,15 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), vni, md, flags, udp_sum); - if (err < 0) + if (err < 0) { + reason = SKB_DROP_REASON_NOMEM; goto tx_error; + } udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, - &local_ip.sin6.sin6_addr, - &dst->sin6.sin6_addr, tos, ttl, - label, src_port, dst_port, !udp_sum); + &saddr, &pkey->u.ipv6.dst, tos, ttl, + pkey->label, src_port, dst_port, !udp_sum, + ip6cb_flags); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); @@ -2669,21 +2621,21 @@ out_unlock: return; drop: - dev->stats.tx_dropped++; + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); - dev_kfree_skb(skb); + kfree_skb_reason(skb, reason); return; tx_error: rcu_read_unlock(); if (err == -ELOOP) - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_release(ndst); - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); - kfree_skb(skb); + kfree_skb_reason(skb, reason); } static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, @@ -2697,27 +2649,62 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); - rcu_read_lock(); nh = rcu_dereference(f->nh); - if (!nh) { + if (!nh) + goto drop; + do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); + + if (likely(do_xmit)) + vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); + else + goto drop; + + return; + +drop: + dev_dstats_tx_dropped(dev); + vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, + VXLAN_VNI_STATS_TX_DROPS, 0); + dev_kfree_skb(skb); +} + +static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, + u32 nhid, __be32 vni) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_rdst nh_rdst; + struct nexthop *nh; + bool do_xmit; + u32 hash; + + memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); + hash = skb_get_hash(skb); + + rcu_read_lock(); + nh = nexthop_find_by_id(dev_net(dev), nhid); + if (unlikely(!nh || !nexthop_is_fdb(nh) || !nexthop_is_multipath(nh))) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); + if (vxlan->cfg.saddr.sa.sa_family != 
nh_rdst.remote_ip.sa.sa_family) + goto drop; + if (likely(do_xmit)) - vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); + vxlan_xmit_one(skb, dev, vni, &nh_rdst, false); else goto drop; - return; + return NETDEV_TX_OK; drop: - dev->stats.tx_dropped++; + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); + return NETDEV_TX_OK; } /* Transmit local packets over Vxlan @@ -2731,10 +2718,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst, *fdst = NULL; const struct ip_tunnel_info *info; - bool did_rsc = false; struct vxlan_fdb *f; struct ethhdr *eth; __be32 vni = 0; + u32 nhid = 0; + bool did_rsc; info = skb_tunnel_info(skb); @@ -2744,11 +2732,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && info->mode & IP_TUNNEL_INFO_TX) { vni = tunnel_id_to_key32(info->key.tun_id); + nhid = info->key.nhid; } else { if (info && info->mode & IP_TUNNEL_INFO_TX) vxlan_xmit_one(skb, dev, vni, NULL, false); else - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO); return NETDEV_TX_OK; } } @@ -2771,6 +2760,9 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } + if (nhid) + return vxlan_xmit_nhid(skb, dev, nhid, vni); + if (vxlan->cfg.flags & VXLAN_F_MDB) { struct vxlan_mdb_entry *mdb_entry; @@ -2787,7 +2779,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } eth = eth_hdr(skb); - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + rcu_read_lock(); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && @@ -2795,21 +2788,21 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); } if (f == NULL) { - f = vxlan_find_mac(vxlan, all_zeros_mac, vni); + f = vxlan_find_mac_tx(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); - dev->stats.tx_dropped++; + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); - kfree_skb(skb); - return NETDEV_TX_OK; + kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); + goto out; } } @@ -2831,75 +2824,72 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) if (fdst) vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); } +out: + rcu_read_unlock(); return NETDEV_TX_OK; } /* Walk the forwarding table and purge stale entries */ static void vxlan_cleanup(struct timer_list *t) { - struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); + struct vxlan_dev *vxlan = timer_container_of(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; - unsigned int h; + struct vxlan_fdb *f; if (!netif_running(vxlan->dev)) return; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; - - spin_lock(&vxlan->hash_lock[h]); - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - unsigned long timeout; + rcu_read_lock(); + 
hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + unsigned long timeout; - if (f->state & (NUD_PERMANENT | NUD_NOARP)) - continue; + if (f->state & (NUD_PERMANENT | NUD_NOARP)) + continue; - if (f->flags & NTF_EXT_LEARNED) - continue; + if (f->flags & NTF_EXT_LEARNED) + continue; - timeout = f->used + vxlan->cfg.age_interval * HZ; - if (time_before_eq(timeout, jiffies)) { - netdev_dbg(vxlan->dev, - "garbage collect %pM\n", - f->eth_addr); + timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; + if (time_before_eq(timeout, jiffies)) { + spin_lock(&vxlan->hash_lock); + if (!hlist_unhashed(&f->fdb_node)) { + netdev_dbg(vxlan->dev, "garbage collect %pM\n", + f->key.eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); - } else if (time_before(timeout, next_timer)) - next_timer = timeout; + } + spin_unlock(&vxlan->hash_lock); + } else if (time_before(timeout, next_timer)) { + next_timer = timeout; } - spin_unlock(&vxlan->hash_lock[h]); } + rcu_read_unlock(); mod_timer(&vxlan->age_timer, next_timer); } static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) { - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + ASSERT_RTNL(); - spin_lock(&vn->sock_lock); hlist_del_init_rcu(&vxlan->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&vxlan->hlist6.hlist); #endif - spin_unlock(&vn->sock_lock); } static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan, struct vxlan_dev_node *node) { - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); __be32 vni = vxlan->default_dst.remote_vni; + ASSERT_RTNL(); + node->vxlan = vxlan; - spin_lock(&vn->sock_lock); hlist_add_head_rcu(&node->hlist, vni_head(vs, vni)); - spin_unlock(&vn->sock_lock); } /* Setup stats when device is created */ @@ -2908,47 +2898,37 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; - if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) - vxlan_vnigroup_init(vxlan); + err = rhashtable_init(&vxlan->fdb_hash_tbl, &vxlan_fdb_rht_params); + if (err) + return err; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) { - err = -ENOMEM; - goto err_vnigroup_uninit; + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { + err = vxlan_vnigroup_init(vxlan); + if (err) + goto err_rhashtable_destroy; } err = gro_cells_init(&vxlan->gro_cells, dev); if (err) - goto err_free_percpu; + goto err_vnigroup_uninit; err = vxlan_mdb_init(vxlan); if (err) goto err_gro_cells_destroy; + netdev_lockdep_set_classes(dev); return 0; err_gro_cells_destroy: gro_cells_destroy(&vxlan->gro_cells); -err_free_percpu: - free_percpu(dev->tstats); err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); +err_rhashtable_destroy: + rhashtable_destroy(&vxlan->fdb_hash_tbl); return err; } -static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) -{ - struct vxlan_fdb *f; - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); - if (f) - vxlan_fdb_destroy(vxlan, f, true, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); -} - static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -2960,9 +2940,7 @@ static void vxlan_uninit(struct net_device *dev) gro_cells_destroy(&vxlan->gro_cells); - vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); - - free_percpu(dev->tstats); + rhashtable_destroy(&vxlan->fdb_hash_tbl); } /* Start ageing 
timer and join group when device is brought up */ @@ -2987,40 +2965,226 @@ static int vxlan_open(struct net_device *dev) return ret; } +struct vxlan_fdb_flush_desc { + bool ignore_default_entry; + unsigned long state; + unsigned long state_mask; + unsigned long flags; + unsigned long flags_mask; + __be32 src_vni; + u32 nhid; + __be32 vni; + __be16 port; + union vxlan_addr dst_ip; +}; + +static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, + const struct vxlan_dev *vxlan) +{ + return is_zero_ether_addr(f->key.eth_addr) && + f->key.vni == vxlan->cfg.vni; +} + +static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) +{ + struct nexthop *nh = rtnl_dereference(f->nh); + + return nh && nh->id == nhid; +} + +static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, + const struct vxlan_dev *vxlan, + const struct vxlan_fdb_flush_desc *desc) +{ + if (desc->state_mask && (f->state & desc->state_mask) != desc->state) + return false; + + if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) + return false; + + if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) + return false; + + if (desc->src_vni && f->key.vni != desc->src_vni) + return false; + + if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) + return false; + + return true; +} + +static bool +vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc) +{ + return desc->vni || desc->port || desc->dst_ip.sa.sa_family; +} + +static bool +vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc, + const struct vxlan_rdst *rd) +{ + if (desc->vni && rd->remote_vni != desc->vni) + return false; + + if (desc->port && rd->remote_port != desc->port) + return false; + + if (desc->dst_ip.sa.sa_family && + !vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip)) + return false; + + return true; +} + +static void +vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan, + const struct vxlan_fdb_flush_desc *desc, + bool *p_destroy_fdb) +{ + bool remotes_flushed = false; + struct vxlan_rdst *rd, *tmp; + + list_for_each_entry_safe(rd, tmp, &f->remotes, list) { + if (!vxlan_fdb_flush_remote_matches(desc, rd)) + continue; + + vxlan_fdb_dst_destroy(vxlan, f, rd, true); + remotes_flushed = true; + } + + *p_destroy_fdb = remotes_flushed && list_empty(&f->remotes); +} + /* Purge the forwarding table */ -static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all) +static void vxlan_flush(struct vxlan_dev *vxlan, + const struct vxlan_fdb_flush_desc *desc) { - unsigned int h; + bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); + struct vxlan_fdb *f; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + if (!vxlan_fdb_flush_matches(f, vxlan, desc)) + continue; - spin_lock_bh(&vxlan->hash_lock[h]); - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP))) - continue; - /* the all_zeros_mac entry is deleted at vxlan_uninit */ - if (is_zero_ether_addr(f->eth_addr) && - f->vni == vxlan->cfg.vni) - continue; - vxlan_fdb_destroy(vxlan, f, true, true); + spin_lock_bh(&vxlan->hash_lock); + if (hlist_unhashed(&f->fdb_node)) + goto unlock; + + if (match_remotes) { + bool destroy_fdb = false; + + vxlan_fdb_flush_match_remotes(f, vxlan, desc, + &destroy_fdb); + + if (!destroy_fdb) + goto unlock; + } + + vxlan_fdb_destroy(vxlan, f, 
 /* Purge the forwarding table */
-static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
+static void vxlan_flush(struct vxlan_dev *vxlan,
+			const struct vxlan_fdb_flush_desc *desc)
 {
-	unsigned int h;
+	bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc);
+	struct vxlan_fdb *f;
 
-	for (h = 0; h < FDB_HASH_SIZE; ++h) {
-		struct hlist_node *p, *n;
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) {
+		if (!vxlan_fdb_flush_matches(f, vxlan, desc))
+			continue;
 
-		spin_lock_bh(&vxlan->hash_lock[h]);
-		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
-			struct vxlan_fdb *f
-				= container_of(p, struct vxlan_fdb, hlist);
-			if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
-				continue;
-			/* the all_zeros_mac entry is deleted at vxlan_uninit */
-			if (is_zero_ether_addr(f->eth_addr) &&
-			    f->vni == vxlan->cfg.vni)
-				continue;
-			vxlan_fdb_destroy(vxlan, f, true, true);
+		spin_lock_bh(&vxlan->hash_lock);
+		if (hlist_unhashed(&f->fdb_node))
+			goto unlock;
+
+		if (match_remotes) {
+			bool destroy_fdb = false;
+
+			vxlan_fdb_flush_match_remotes(f, vxlan, desc,
+						      &destroy_fdb);
+
+			if (!destroy_fdb)
+				goto unlock;
+		}
+
+		vxlan_fdb_destroy(vxlan, f, true, true);
+unlock:
+		spin_unlock_bh(&vxlan->hash_lock);
+	}
+	rcu_read_unlock();
+}
+
+static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = {
+	[NDA_SRC_VNI]	= { .type = NLA_U32 },
+	[NDA_NH_ID]	= { .type = NLA_U32 },
+	[NDA_VNI]	= { .type = NLA_U32 },
+	[NDA_PORT]	= { .type = NLA_U16 },
+	[NDA_DST]	= NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr),
+					   sizeof(struct in6_addr)),
+	[NDA_NDM_STATE_MASK]	= { .type = NLA_U16 },
+	[NDA_NDM_FLAGS_MASK]	= { .type = NLA_U8 },
+};
+
+#define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF)
+#define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP)
+#define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \
+					   NTF_ROUTER)
+
+static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
+				 struct netlink_ext_ack *extack)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb_flush_desc desc = {};
+	struct ndmsg *ndm = nlmsg_data(nlh);
+	struct nlattr *tb[NDA_MAX + 1];
+	u8 ndm_flags;
+	int err;
+
+	ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS;
+
+	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy,
+			  extack);
+	if (err)
+		return err;
+
+	if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) {
+		NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
+		return -EINVAL;
+	}
+	if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) {
+		NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set");
+		return -EINVAL;
+	}
+
+	desc.state = ndm->ndm_state;
+	desc.flags = ndm_flags;
+
+	if (tb[NDA_NDM_STATE_MASK])
+		desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]);
+
+	if (tb[NDA_NDM_FLAGS_MASK])
+		desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]);
+
+	if (tb[NDA_SRC_VNI])
+		desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
+
+	if (tb[NDA_NH_ID])
+		desc.nhid = nla_get_u32(tb[NDA_NH_ID]);
+
+	if (tb[NDA_VNI])
+		desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
+
+	if (tb[NDA_PORT])
+		desc.port = nla_get_be16(tb[NDA_PORT]);
+
+	if (tb[NDA_DST]) {
+		union vxlan_addr ip;
+
+		err = vxlan_nla_get_addr(&ip, tb[NDA_DST]);
+		if (err) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST],
+					    "Unsupported address family");
+			return err;
+		}
+		desc.dst_ip = ip;
+	}
+
+	vxlan_flush(vxlan, &desc);
+
+	return 0;
+}
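The bulk-delete handler maps the NDA_* attributes straight onto the descriptor; the state/flags comparison follows the usual mask-and-value convention, restated in isolation below (example_masked_match is illustrative, not a helper from this patch). With a recent enough iproute2, a command along the lines of "bridge fdb flush dev vxlan0 self" should reach this handler via .ndo_fdb_del_bulk.

/* Illustrative restatement of the rule used by vxlan_fdb_flush_matches():
 * an empty mask matches everything, otherwise only the masked bits are
 * compared against the wanted value.
 */
static bool example_masked_match(unsigned long val, unsigned long want,
				 unsigned long mask)
{
	return !mask || (val & mask) == want;
}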
 
 /* Cleanup timer and forwarding table on shutdown */
 static int vxlan_stop(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb_flush_desc desc = {
+		/* Default entry is deleted at vxlan_dellink. */
+		.ignore_default_entry = true,
+		.state = 0,
+		.state_mask = NUD_PERMANENT | NUD_NOARP,
+	};
 
 	vxlan_multicast_leave(vxlan);
 
-	del_timer_sync(&vxlan->age_timer);
+	timer_delete_sync(&vxlan->age_timer);
 
-	vxlan_flush(vxlan, false);
+	vxlan_flush(vxlan, &desc);
 	vxlan_sock_release(vxlan);
 
 	return 0;
@@ -3047,7 +3211,7 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
 		return -EINVAL;
 	}
 
-	dev->mtu = new_mtu;
+	WRITE_ONCE(dev->mtu, new_mtu);
 	return 0;
 }
 
@@ -3065,11 +3229,14 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
 		struct rtable *rt;
 
-		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
-				     info->key.u.ipv4.dst,
-				     &info->key.u.ipv4.src, dport, sport,
-				     info->key.flow_flags, &info->dst_cache,
-				     info);
+		if (!sock4)
+			return -EIO;
+
+		rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0,
+					   &info->key.u.ipv4.src,
+					   &info->key,
+					   sport, dport, info->key.tos,
+					   &info->dst_cache);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 		ip_rt_put(rt);
@@ -3078,10 +3245,14 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
 		struct dst_entry *ndst;
 
-		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
-					info->key.label, &info->key.u.ipv6.dst,
-					&info->key.u.ipv6.src, dport, sport,
-					&info->dst_cache, info);
+		if (!sock6)
+			return -EIO;
+
+		ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock,
+					      0, &info->key.u.ipv6.src,
+					      &info->key,
+					      sport, dport, info->key.tos,
+					      &info->dst_cache);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
 		dst_release(ndst);
@@ -3100,18 +3271,20 @@ static const struct net_device_ops vxlan_netdev_ether_ops = {
 	.ndo_open		= vxlan_open,
 	.ndo_stop		= vxlan_stop,
 	.ndo_start_xmit		= vxlan_xmit,
-	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_rx_mode	= vxlan_set_multicast_list,
 	.ndo_change_mtu		= vxlan_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_fdb_add		= vxlan_fdb_add,
 	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_del_bulk	= vxlan_fdb_delete_bulk,
 	.ndo_fdb_dump		= vxlan_fdb_dump,
 	.ndo_fdb_get		= vxlan_fdb_get,
 	.ndo_mdb_add		= vxlan_mdb_add,
 	.ndo_mdb_del		= vxlan_mdb_del,
+	.ndo_mdb_del_bulk	= vxlan_mdb_del_bulk,
 	.ndo_mdb_dump		= vxlan_mdb_dump,
+	.ndo_mdb_get		= vxlan_mdb_get,
 	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
 };
 
@@ -3121,13 +3294,12 @@ static const struct net_device_ops vxlan_netdev_raw_ops = {
 	.ndo_open		= vxlan_open,
 	.ndo_stop		= vxlan_stop,
 	.ndo_start_xmit		= vxlan_xmit,
-	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_change_mtu		= vxlan_change_mtu,
 	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
 };
 
 /* Info for udev, that this is a virtual tunnel endpoint */
-static struct device_type vxlan_type = {
+static const struct device_type vxlan_type = {
 	.name = "vxlan",
 };
 
@@ -3142,9 +3314,10 @@ static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	unsigned int i;
 
-	spin_lock(&vn->sock_lock);
+	ASSERT_RTNL();
+
 	for (i = 0; i < PORT_HASH_SIZE; ++i) {
-		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
+		hlist_for_each_entry(vs, &vn->sock_list[i], hlist) {
 			unsigned short type;
 
 			if (vs->flags & VXLAN_F_GPE)
@@ -3158,14 +3331,12 @@ static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
 				udp_tunnel_drop_rx_port(dev, vs->sock, type);
 		}
 	}
-	spin_unlock(&vn->sock_lock);
 }
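With the socket list now mutated only under RTNL, control-path readers such as vxlan_offload_rx_ports() can drop both the spinlock and the _rcu accessors; the ASSERT_RTNL() documents and enforces the new contract. A generic sketch of the resulting pattern, with an illustrative function name:

/* Illustrative: a plain walk is safe because every add/remove of a
 * vxlan_sock also runs under RTNL after this change.
 */
static unsigned int example_count_socks(struct hlist_head *head)
{
	struct vxlan_sock *vs;
	unsigned int n = 0;

	ASSERT_RTNL();
	hlist_for_each_entry(vs, head, hlist)
		n++;
	return n;
}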
 /* Initialize the device structure. */
 static void vxlan_setup(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	unsigned int h;
 
 	eth_hw_addr_random(dev);
 	ether_setup(dev);
@@ -3173,7 +3344,6 @@ static void vxlan_setup(struct net_device *dev)
 	dev->needs_free_netdev = true;
 	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
 
-	dev->features   |= NETIF_F_LLTX;
 	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
 	dev->features   |= NETIF_F_RXCSUM;
 	dev->features   |= NETIF_F_GSO_SOFTWARE;
@@ -3183,22 +3353,23 @@ static void vxlan_setup(struct net_device *dev)
 	dev->hw_features |= NETIF_F_RXCSUM;
 	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 	netif_keep_dst(dev);
-	dev->priv_flags |= IFF_NO_QUEUE | IFF_CHANGE_PROTO_DOWN;
+	dev->priv_flags |= IFF_NO_QUEUE;
+	dev->change_proto_down = true;
+	dev->lltx = true;
 
 	/* MTU range: 68 - 65535 */
 	dev->min_mtu = ETH_MIN_MTU;
 	dev->max_mtu = ETH_MAX_MTU;
 
+	dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
 	INIT_LIST_HEAD(&vxlan->next);
+	spin_lock_init(&vxlan->hash_lock);
 
 	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
 
 	vxlan->dev = dev;
 
-	for (h = 0; h < FDB_HASH_SIZE; ++h) {
-		spin_lock_init(&vxlan->hash_lock[h]);
-		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
-	}
+	INIT_HLIST_HEAD(&vxlan->fdb_list);
 }
 
 static void vxlan_ether_setup(struct net_device *dev)
@@ -3251,6 +3422,9 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_LOCALBYPASS]	= NLA_POLICY_MAX(NLA_U8, 1),
+	[IFLA_VXLAN_LABEL_POLICY]	= NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
+	[IFLA_VXLAN_RESERVED_BITS]	= NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)),
+	[IFLA_VXLAN_MC_ROUTE]	= NLA_POLICY_MAX(NLA_U8, 1),
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -3387,12 +3561,13 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 					      __be16 port, u32 flags,
 					      int ifindex)
 {
-	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_sock *vs;
 	struct socket *sock;
 	unsigned int h;
 	struct udp_tunnel_sock_cfg tunnel_cfg;
 
+	ASSERT_RTNL();
+
 	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
 	if (!vs)
 		return ERR_PTR(-ENOMEM);
@@ -3410,13 +3585,11 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	refcount_set(&vs->refcnt, 1);
 	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
 
-	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
 	udp_tunnel_notify_add_rx_port(sock,
 				      (vs->flags & VXLAN_F_GPE) ?
 				      UDP_TUNNEL_TYPE_VXLAN_GPE :
 				      UDP_TUNNEL_TYPE_VXLAN);
-	spin_unlock(&vn->sock_lock);
 
 	/* Mark socket as an encapsulation socket. */
 	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
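In the socket-sharing path that follows, the lookup now runs under RCU rather than the per-netns spinlock, so a socket found in the table may already be on its way down; refcount_inc_not_zero() is what makes the grab race-free. The core of that pattern, reduced to a sketch (the function name is illustrative):

/* Illustrative: RCU lookup plus conditional reference grab. If the
 * refcount already hit zero, the object is being torn down and must be
 * treated as not found.
 */
static struct vxlan_sock *example_grab_sock(struct net *net, __be16 port,
					    u32 flags, int ifindex)
{
	struct vxlan_sock *vs;

	rcu_read_lock();
	vs = vxlan_find_sock(net, AF_INET, port, flags, ifindex);
	if (vs && !refcount_inc_not_zero(&vs->refcnt))
		vs = NULL;	/* lost the race against teardown */
	rcu_read_unlock();

	return vs;		/* holds a reference iff non-NULL */
}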
@@ -3440,26 +3613,27 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
 {
-	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
 	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
 	struct vxlan_sock *vs = NULL;
 	struct vxlan_dev_node *node;
 	int l3mdev_index = 0;
 
+	ASSERT_RTNL();
+
 	if (vxlan->cfg.remote_ifindex)
 		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
 			vxlan->net, vxlan->cfg.remote_ifindex);
 
 	if (!vxlan->cfg.no_share) {
-		spin_lock(&vn->sock_lock);
+		rcu_read_lock();
 		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
 				     vxlan->cfg.dst_port, vxlan->cfg.flags,
 				     l3mdev_index);
 		if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
-			spin_unlock(&vn->sock_lock);
+			rcu_read_unlock();
 			return -EBUSY;
 		}
-		spin_unlock(&vn->sock_lock);
+		rcu_read_unlock();
 	}
 	if (!vs)
 		vs = vxlan_socket_create(vxlan->net, ipv6,
@@ -3625,6 +3799,12 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
 			return -EINVAL;
 		}
 
+	if (conf->label_policy && !use_ipv6) {
+		NL_SET_ERR_MSG(extack,
+			       "Label policy only applies to IPv6 VXLAN devices");
+		return -EINVAL;
+	}
+
 	if (conf->remote_ifindex) {
 		struct net_device *lowerdev;
 
@@ -3744,7 +3924,7 @@ static void vxlan_config_apply(struct net_device *dev,
 }
 
 static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
-			       struct vxlan_config *conf, bool changelink,
+			       struct vxlan_config *conf,
 			       struct netlink_ext_ack *extack)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -3755,7 +3935,7 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
 	if (ret)
 		return ret;
 
-	vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);
+	vxlan_config_apply(dev, conf, lowerdev, src_net, false);
 
 	return 0;
 }
@@ -3767,84 +3947,64 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev,
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct net_device *remote_dev = NULL;
-	struct vxlan_fdb *f = NULL;
-	bool unregister = false;
 	struct vxlan_rdst *dst;
 	int err;
 
 	dst = &vxlan->default_dst;
-	err = vxlan_dev_configure(net, dev, conf, false, extack);
+	err = vxlan_dev_configure(net, dev, conf, extack);
 	if (err)
 		return err;
 
 	dev->ethtool_ops = &vxlan_ethtool_ops;
 
-	/* create an fdb entry for a valid default destination */
-	if (!vxlan_addr_any(&dst->remote_ip)) {
-		err = vxlan_fdb_create(vxlan, all_zeros_mac,
-				       &dst->remote_ip,
-				       NUD_REACHABLE | NUD_PERMANENT,
-				       vxlan->cfg.dst_port,
-				       dst->remote_vni,
-				       dst->remote_vni,
-				       dst->remote_ifindex,
-				       NTF_SELF, 0, &f, extack);
-		if (err)
-			return err;
-	}
-
 	err = register_netdevice(dev);
 	if (err)
-		goto errout;
-	unregister = true;
+		return err;
 
 	if (dst->remote_ifindex) {
 		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
 		if (!remote_dev) {
 			err = -ENODEV;
-			goto errout;
+			goto unregister;
 		}
 
 		err = netdev_upper_dev_link(remote_dev, dev, extack);
 		if (err)
-			goto errout;
+			goto unregister;
+
+		dst->remote_dev = remote_dev;
 	}
 
 	err = rtnl_configure_link(dev, NULL, 0, NULL);
 	if (err < 0)
 		goto unlink;
 
-	if (f) {
-		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);
-
-		/* notify default fdb entry */
-		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
-				       RTM_NEWNEIGH, true, extack);
-		if (err) {
-			vxlan_fdb_destroy(vxlan, f, false, false);
-			if (remote_dev)
-				netdev_upper_dev_unlink(remote_dev, dev);
-			goto unregister;
-		}
+	/* create an fdb entry for a valid default destination */
+	if (!vxlan_addr_any(&dst->remote_ip)) {
+		spin_lock_bh(&vxlan->hash_lock);
+		err = vxlan_fdb_update(vxlan, all_zeros_mac,
+				       &dst->remote_ip,
+				       NUD_REACHABLE | NUD_PERMANENT,
+				       NLM_F_EXCL | NLM_F_CREATE,
+				       vxlan->cfg.dst_port,
+				       dst->remote_vni,
+				       dst->remote_vni,
+				       dst->remote_ifindex,
+				       NTF_SELF, 0, true, extack);
+		spin_unlock_bh(&vxlan->hash_lock);
+		if (err)
+			goto unlink;
 	}
 
 	list_add(&vxlan->next, &vn->vxlan_list);
-	if (remote_dev)
-		dst->remote_dev = remote_dev;
+
 	return 0;
+
 unlink:
 	if (remote_dev)
 		netdev_upper_dev_unlink(remote_dev, dev);
-errout:
-	/* unregister_netdevice() destroys the default FDB entry with deletion
-	 * notification. But the addition notification was not sent yet, so
-	 * destroy the entry by hand here.
-	 */
-	if (f)
-		__vxlan_fdb_free(f);
 unregister:
-	if (unregister)
-		unregister_netdevice(dev);
+	unregister_netdevice(dev);
 	return err;
 }
 
@@ -3880,6 +4040,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 			 struct net_device *dev, struct vxlan_config *conf,
 			 bool changelink, struct netlink_ext_ack *extack)
 {
+	struct vxlanhdr used_bits = {
+		.vx_flags = VXLAN_HF_VNI,
+		.vx_vni = VXLAN_VNI_MASK,
+	};
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	int err = 0;
 
@@ -3896,7 +4060,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
 			return -EOPNOTSUPP;
 		}
-		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
+		conf->vni = vni;
 	}
 
 	if (data[IFLA_VXLAN_GROUP]) {
@@ -3967,6 +4131,8 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_VXLAN_LABEL])
 		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
 			     IPV6_FLOWLABEL_MASK;
+	if (data[IFLA_VXLAN_LABEL_POLICY])
+		conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]);
 
 	if (data[IFLA_VXLAN_LEARNING]) {
 		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
@@ -4104,6 +4270,8 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 				    extack);
 		if (err)
 			return err;
+		used_bits.vx_flags |= VXLAN_HF_RCO;
+		used_bits.vx_vni |= ~VXLAN_VNI_MASK;
 	}
 
 	if (data[IFLA_VXLAN_GBP]) {
@@ -4111,6 +4279,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 				    VXLAN_F_GBP, changelink, false, extack);
 		if (err)
 			return err;
+		used_bits.vx_flags |= VXLAN_GBP_USED_BITS;
 	}
 
 	if (data[IFLA_VXLAN_GPE]) {
@@ -4119,6 +4288,46 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 				    extack);
 		if (err)
 			return err;
+
+		used_bits.vx_flags |= VXLAN_GPE_USED_BITS;
+	}
+
+	if (data[IFLA_VXLAN_RESERVED_BITS]) {
+		struct vxlanhdr reserved_bits;
+
+		if (changelink) {
+			NL_SET_ERR_MSG_ATTR(extack,
+					    data[IFLA_VXLAN_RESERVED_BITS],
+					    "Cannot change reserved_bits");
+			return -EOPNOTSUPP;
+		}
+
+		nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS],
+			   sizeof(reserved_bits));
+		if (used_bits.vx_flags & reserved_bits.vx_flags ||
+		    used_bits.vx_vni & reserved_bits.vx_vni) {
+			__be64 ub_be64, rb_be64;
+
+			memcpy(&ub_be64, &used_bits, sizeof(ub_be64));
+			memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64));
+
+			NL_SET_ERR_MSG_ATTR_FMT(extack,
+						data[IFLA_VXLAN_RESERVED_BITS],
+						"Used bits %#018llx cannot overlap reserved bits %#018llx",
+						be64_to_cpu(ub_be64),
+						be64_to_cpu(rb_be64));
+			return -EINVAL;
+		}
+
+		conf->reserved_bits = reserved_bits;
+	} else {
+		/* For backwards compatibility, only allow reserved fields to be
+		 * used by VXLAN extensions if explicitly requested.
+		 */
+		conf->reserved_bits = (struct vxlanhdr) {
+			.vx_flags = ~used_bits.vx_flags,
+			.vx_vni = ~used_bits.vx_vni,
+		};
 	}
 
 	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
@@ -4129,6 +4338,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 			return err;
 	}
 
+	if (data[IFLA_VXLAN_MC_ROUTE]) {
+		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_MC_ROUTE,
+				    VXLAN_F_MC_ROUTE, changelink,
+				    true, extack);
+		if (err)
+			return err;
+	}
+
 	if (tb[IFLA_MTU]) {
 		if (changelink) {
 			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
@@ -4159,10 +4376,13 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 	return 0;
 }
 
-static int vxlan_newlink(struct net *src_net, struct net_device *dev,
-			 struct nlattr *tb[], struct nlattr *data[],
+static int vxlan_newlink(struct net_device *dev,
+			 struct rtnl_newlink_params *params,
 			 struct netlink_ext_ack *extack)
 {
+	struct net *link_net = rtnl_newlink_link_net(params);
+	struct nlattr **data = params->data;
+	struct nlattr **tb = params->tb;
 	struct vxlan_config conf;
 	int err;
 
@@ -4170,7 +4390,7 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (err)
 		return err;
 
-	return __vxlan_dev_create(src_net, dev, &conf, extack);
+	return __vxlan_dev_create(link_net, dev, &conf, extack);
 }
 
 static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -4178,6 +4398,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 			    struct netlink_ext_ack *extack)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	bool rem_ip_changed, change_igmp;
 	struct net_device *lowerdev;
 	struct vxlan_config conf;
 	struct vxlan_rdst *dst;
@@ -4201,11 +4422,14 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (err)
 		return err;
 
-	/* handle default dst entry */
-	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
-		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);
+	rem_ip_changed = !vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip);
+	change_igmp = vxlan->dev->flags & IFF_UP &&
+		      (rem_ip_changed ||
+		       dst->remote_ifindex != conf.remote_ifindex);
 
-		spin_lock_bh(&vxlan->hash_lock[hash_index]);
+	/* handle default dst entry */
+	if (rem_ip_changed) {
+		spin_lock_bh(&vxlan->hash_lock);
 		if (!vxlan_addr_any(&conf.remote_ip)) {
 			err = vxlan_fdb_update(vxlan, all_zeros_mac,
 					       &conf.remote_ip,
@@ -4216,7 +4440,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 					       conf.remote_ifindex,
 					       NTF_SELF, 0, true, extack);
 			if (err) {
-				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
+				spin_unlock_bh(&vxlan->hash_lock);
 				netdev_adjacent_change_abort(dst->remote_dev,
 							     lowerdev, dev);
 				return err;
@@ -4230,7 +4454,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 					   dst->remote_vni,
 					   dst->remote_ifindex,
 					   true);
-		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
+		spin_unlock_bh(&vxlan->hash_lock);
 
 		/* If vni filtering device, also update fdb entries of
 		 * all vnis that were using default remote ip
@@ -4246,6 +4470,9 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 		}
 	}
 
+	if (change_igmp && vxlan_addr_multicast(&dst->remote_ip))
+		err = vxlan_multicast_leave(vxlan);
+
 	if (conf.age_interval != vxlan->cfg.age_interval)
 		mod_timer(&vxlan->age_timer, jiffies);
 
@@ -4253,14 +4480,20 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (lowerdev && lowerdev != dst->remote_dev)
 		dst->remote_dev = lowerdev;
 	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
-	return 0;
+
+	if (!err && change_igmp &&
+	    vxlan_addr_multicast(&dst->remote_ip))
+		err = vxlan_multicast_join(vxlan);
+
+	return err;
 }
 
 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb_flush_desc desc = {};
 
-	vxlan_flush(vxlan, true);
+	vxlan_flush(vxlan, &desc);
 
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
@@ -4270,7 +4503,6 @@
 
 static size_t vxlan_get_size(const struct net_device *dev)
 {
-
 	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
@@ -4280,6 +4512,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LABEL_POLICY */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
@@ -4288,7 +4521,6 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
-		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
 		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_UDP_CSUM */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
@@ -4296,6 +4528,14 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_REMCSUM_TX */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_REMCSUM_RX */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LOCALBYPASS */
+		/* IFLA_VXLAN_PORT_RANGE */
+		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
+		nla_total_size(0) + /* IFLA_VXLAN_GBP */
+		nla_total_size(0) + /* IFLA_VXLAN_GPE */
+		nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
+		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */
+		/* IFLA_VXLAN_RESERVED_BITS */
+		nla_total_size(sizeof(struct vxlanhdr)) +
 		0;
 }
 
@@ -4348,6 +4588,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
 	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
+	    nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
@@ -4397,6 +4638,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			!!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
 		goto nla_put_failure;
 
+	if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS,
+		    sizeof(vxlan->cfg.reserved_bits),
+		    &vxlan->cfg.reserved_bits))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -4407,7 +4653,7 @@ static struct net *vxlan_get_link_net(const struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 
-	return vxlan->net;
+	return READ_ONCE(vxlan->net);
 }
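The READ_ONCE() in vxlan_get_link_net() suggests the device's netns pointer is now read locklessly by some callers, presumably pairing with a WRITE_ONCE() on the (RTNL-protected) writer side so readers get a tear-free snapshot. A tiny illustrative use of such a snapshot; the helper name is an assumption for this sketch:

/* Illustrative: compare against a lockless snapshot of vxlan->net. */
static bool example_same_netns(const struct net_device *dev, struct net *net)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);

	return net_eq(READ_ONCE(vxlan->net), net);
}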
 
 static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
@@ -4508,11 +4754,8 @@ vxlan_fdb_offloaded_set(struct net_device *dev,
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_rdst *rdst;
 	struct vxlan_fdb *f;
-	u32 hash_index;
-
-	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
 
-	spin_lock_bh(&vxlan->hash_lock[hash_index]);
+	spin_lock_bh(&vxlan->hash_lock);
 
 	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
 	if (!f)
@@ -4528,7 +4771,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev,
 	rdst->offloaded = fdb_info->offloaded;
 
 out:
-	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
+	spin_unlock_bh(&vxlan->hash_lock);
 }
 
 static int
@@ -4537,13 +4780,11 @@ vxlan_fdb_external_learn_add(struct net_device *dev,
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct netlink_ext_ack *extack;
-	u32 hash_index;
 	int err;
 
-	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
 	extack = switchdev_notifier_info_to_extack(&fdb_info->info);
 
-	spin_lock_bh(&vxlan->hash_lock[hash_index]);
+	spin_lock_bh(&vxlan->hash_lock);
 	err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
 			       NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE,
@@ -4553,7 +4794,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev,
 			       fdb_info->remote_ifindex,
 			       NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
 			       0, false, extack);
-	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
+	spin_unlock_bh(&vxlan->hash_lock);
 
 	return err;
 }
@@ -4564,11 +4805,9 @@ vxlan_fdb_external_learn_del(struct net_device *dev,
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
-	u32 hash_index;
 	int err = 0;
 
-	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
-	spin_lock_bh(&vxlan->hash_lock[hash_index]);
+	spin_lock_bh(&vxlan->hash_lock);
 
 	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
 	if (!f)
@@ -4582,7 +4821,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev,
 			       fdb_info->remote_ifindex,
 			       false);
 
-	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
+	spin_unlock_bh(&vxlan->hash_lock);
 
 	return err;
 }
@@ -4631,18 +4870,15 @@ static void vxlan_fdb_nh_flush(struct nexthop *nh)
 {
 	struct vxlan_fdb *fdb;
 	struct vxlan_dev *vxlan;
-	u32 hash_index;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) {
 		vxlan = rcu_dereference(fdb->vdev);
 		WARN_ON(!vxlan);
-		hash_index = fdb_head_index(vxlan, fdb->eth_addr,
-					    vxlan->default_dst.remote_vni);
-		spin_lock_bh(&vxlan->hash_lock[hash_index]);
-		if (!hlist_unhashed(&fdb->hlist))
+		spin_lock_bh(&vxlan->hash_lock);
+		if (!hlist_unhashed(&fdb->fdb_node))
 			vxlan_fdb_destroy(vxlan, fdb, false, false);
-		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
+		spin_unlock_bh(&vxlan->hash_lock);
 	}
 	rcu_read_unlock();
 }
@@ -4671,7 +4907,6 @@ static __net_init int vxlan_init_net(struct net *net)
 	unsigned int h;
 
 	INIT_LIST_HEAD(&vn->vxlan_list);
-	spin_lock_init(&vn->sock_lock);
 	vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event;
 
 	for (h = 0; h < PORT_HASH_SIZE; ++h)
@@ -4681,55 +4916,39 @@ static __net_init int vxlan_init_net(struct net *net)
 					 NULL);
 }
 
-static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
+static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn,
+					     struct list_head *dev_to_kill)
 {
-	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_dev *vxlan, *next;
-	struct net_device *dev, *aux;
-
-	for_each_netdev_safe(net, dev, aux)
-		if (dev->rtnl_link_ops == &vxlan_link_ops)
-			unregister_netdevice_queue(dev, head);
-
-	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
-		/* If vxlan->dev is in the same netns, it has already been added
-		 * to the list by the previous loop.
-		 */
-		if (!net_eq(dev_net(vxlan->dev), net))
-			unregister_netdevice_queue(vxlan->dev, head);
-	}
+	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next)
+		vxlan_dellink(vxlan->dev, dev_to_kill);
 }
 
-static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
+static void __net_exit vxlan_exit_rtnl(struct net *net,
+				       struct list_head *dev_to_kill)
 {
-	struct net *net;
-	LIST_HEAD(list);
-	unsigned int h;
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 
-	list_for_each_entry(net, net_list, exit_list) {
-		struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	ASSERT_RTNL_NET(net);
 
-		unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);
-	}
-	rtnl_lock();
-	list_for_each_entry(net, net_list, exit_list)
-		vxlan_destroy_tunnels(net, &list);
-
-	unregister_netdevice_many(&list);
-	rtnl_unlock();
+	__unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);
+	vxlan_destroy_tunnels(vn, dev_to_kill);
+}
 
-	list_for_each_entry(net, net_list, exit_list) {
-		struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+static void __net_exit vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	unsigned int h;
 
-		for (h = 0; h < PORT_HASH_SIZE; ++h)
-			WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
-	}
+	for (h = 0; h < PORT_HASH_SIZE; ++h)
+		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
 }
 
 static struct pernet_operations vxlan_net_ops = {
 	.init = vxlan_init_net,
-	.exit_batch = vxlan_exit_batch_net,
+	.exit_rtnl = vxlan_exit_rtnl,
+	.exit = vxlan_exit_net,
 	.id = &vxlan_net_id,
 	.size = sizeof(struct vxlan_net),
 };
@@ -4738,8 +4957,6 @@ static int __init vxlan_init_module(void)
 {
 	int rc;
 
-	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
-
 	rc = register_pernet_subsys(&vxlan_net_ops);
 	if (rc)
 		goto out1;
@@ -4756,9 +4973,13 @@ static int __init vxlan_init_module(void)
 	if (rc)
 		goto out4;
 
-	vxlan_vnifilter_init();
+	rc = vxlan_vnifilter_init();
+	if (rc)
+		goto out5;
 
 	return 0;
+out5:
+	rtnl_link_unregister(&vxlan_link_ops);
 out4:
 	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
 out3:
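The switch from .exit_batch to the .exit_rtnl/.exit pair lets the pernet core run per-namespace device teardown for all dying namespaces inside one shared RTNL section, batching the final unregistration through the common dev_to_kill list, while lock-free cleanup (here, the sock_list warnings) stays in plain .exit. A minimal sketch of pernet ops using the hook, assuming the core honors this contract; names other than the struct members are illustrative, and the loop simply mirrors the removed for_each_netdev_safe() walk above:

/* Illustrative: queue this namespace's devices for batched removal. */
static void __net_exit example_exit_rtnl(struct net *net,
					 struct list_head *dev_to_kill)
{
	struct net_device *dev, *aux;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
			unregister_netdevice_queue(dev, dev_to_kill);
}

static struct pernet_operations example_net_ops = {
	.exit_rtnl = example_exit_rtnl,
};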
