Diffstat (limited to 'drivers/net/vxlan')

 drivers/net/vxlan/vxlan_core.c      | 443
 drivers/net/vxlan/vxlan_mdb.c       |   4
 drivers/net/vxlan/vxlan_private.h   |   2
 drivers/net/vxlan/vxlan_vnifilter.c |  24
 4 files changed, 289 insertions(+), 184 deletions(-)
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 16106e088c63..92516189e792 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -277,8 +277,7 @@ static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
 	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
 	return;
 errout:
-	if (err < 0)
-		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+	rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 }
 
 static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
@@ -623,9 +622,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 	return 1;
 }
 
-static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol)
+static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol)
 {
-	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr;
+	const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr;
 
 	/* Need to have Next Protocol set for interfaces in GPE mode. */
 	if (!gpe->np_applied)
@@ -1233,10 +1232,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
 		*ifindex = 0;
 	}
 
-	if (tb[NDA_NH_ID])
-		*nhid = nla_get_u32(tb[NDA_NH_ID]);
-	else
-		*nhid = 0;
+	*nhid = nla_get_u32_default(tb[NDA_NH_ID], 0);
 
 	return 0;
 }
@@ -1245,7 +1241,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			 struct net_device *dev,
 			 const unsigned char *addr, u16 vid, u16 flags,
-			 struct netlink_ext_ack *extack)
+			 bool *notified, struct netlink_ext_ack *extack)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	/* struct net *net = dev_net(vxlan->dev); */
@@ -1281,6 +1277,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			       nhid, true, extack);
 	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
+	if (!err)
+		*notified = true;
+
 	return err;
 }
 
@@ -1320,7 +1319,7 @@ out:
 /* Delete entry (via netlink) */
 static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr, u16 vid,
+			    const unsigned char *addr, u16 vid, bool *notified,
 			    struct netlink_ext_ack *extack)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -1342,6 +1341,9 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 				 true);
 	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
+	if (!err)
+		*notified = true;
+
 	return err;
 }
 
@@ -1350,6 +1352,7 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 			  struct net_device *dev,
 			  struct net_device *filter_dev, int *idx)
 {
+	struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	unsigned int h;
 	int err = 0;
@@ -1362,7 +1365,7 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 			struct vxlan_rdst *rd;
 
 			if (rcu_access_pointer(f->nh)) {
-				if (*idx < cb->args[2])
+				if (*idx < ctx->fdb_idx)
 					goto skip_nh;
 				err = vxlan_fdb_info(skb, vxlan, f,
 						     NETLINK_CB(cb->skb).portid,
@@ -1379,7 +1382,7 @@ skip_nh:
 			}
 
 			list_for_each_entry_rcu(rd, &f->remotes, list) {
-				if (*idx < cb->args[2])
+				if (*idx < ctx->fdb_idx)
 					goto skip;
 
 				err = vxlan_fdb_info(skb, vxlan, f,
@@ -1436,16 +1439,20 @@ errout:
 
 /* Watch incoming packets to learn mapping between Ethernet address
  * and Tunnel endpoint.
- * Return true if packet is bogus and should be dropped.
  */
-static bool vxlan_snoop(struct net_device *dev,
-			union vxlan_addr *src_ip, const u8 *src_mac,
-			u32 src_ifindex, __be32 vni)
+static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
+					union vxlan_addr *src_ip,
+					const u8 *src_mac, u32 src_ifindex,
+					__be32 vni)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
 	u32 ifindex = 0;
 
+	/* Ignore packets from invalid src-address */
+	if (!is_valid_ether_addr(src_mac))
+		return SKB_DROP_REASON_MAC_INVALID_SOURCE;
+
 #if IS_ENABLED(CONFIG_IPV6)
 	if (src_ip->sa.sa_family == AF_INET6 &&
 	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
@@ -1458,15 +1465,15 @@ static bool vxlan_snoop(struct net_device *dev,
 
 		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
 			   rdst->remote_ifindex == ifindex))
-			return false;
+			return SKB_NOT_DROPPED_YET;
 
 		/* Don't migrate static entries, drop packets */
 		if (f->state & (NUD_PERMANENT | NUD_NOARP))
-			return true;
+			return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;
 
 		/* Don't override an fdb with nexthop with a learnt entry */
 		if (rcu_access_pointer(f->nh))
-			return true;
+			return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;
 
 		if (net_ratelimit())
 			netdev_info(dev,
@@ -1494,7 +1501,7 @@ static bool vxlan_snoop(struct net_device *dev,
 		spin_unlock(&vxlan->hash_lock[hash_index]);
 	}
 
-	return false;
+	return SKB_NOT_DROPPED_YET;
 }
 
 static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
@@ -1548,43 +1555,45 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
 #endif
 }
 
-static bool vxlan_remcsum(struct vxlanhdr *unparsed,
-			  struct sk_buff *skb, u32 vxflags)
+static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags)
 {
+	const struct vxlanhdr *vh = vxlan_hdr(skb);
+	enum skb_drop_reason reason;
 	size_t start, offset;
 
-	if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
-		goto out;
+	if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
+		return SKB_NOT_DROPPED_YET;
 
-	start = vxlan_rco_start(unparsed->vx_vni);
-	offset = start + vxlan_rco_offset(unparsed->vx_vni);
+	start = vxlan_rco_start(vh->vx_vni);
+	offset = start + vxlan_rco_offset(vh->vx_vni);
 
-	if (!pskb_may_pull(skb, offset + sizeof(u16)))
-		return false;
+	reason = pskb_may_pull_reason(skb, offset + sizeof(u16));
+	if (reason)
+		return reason;
 
 	skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
 			    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
-out:
-	unparsed->vx_flags &= ~VXLAN_HF_RCO;
-	unparsed->vx_vni &= VXLAN_VNI_MASK;
-	return true;
+	return SKB_NOT_DROPPED_YET;
 }
 
-static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
-				struct sk_buff *skb, u32 vxflags,
+static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags,
 				struct vxlan_metadata *md)
 {
-	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
+	const struct vxlanhdr *vh = vxlan_hdr(skb);
+	const struct vxlanhdr_gbp *gbp;
 	struct metadata_dst *tun_dst;
 
-	if (!(unparsed->vx_flags & VXLAN_HF_GBP))
-		goto out;
+	gbp = (const struct vxlanhdr_gbp *)vh;
+
+	if (!(vh->vx_flags & VXLAN_HF_GBP))
+		return;
 
 	md->gbp = ntohs(gbp->policy_id);
 	tun_dst = (struct metadata_dst *)skb_dst(skb);
 	if (tun_dst) {
-		tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
+		__set_bit(IP_TUNNEL_VXLAN_OPT_BIT,
+			  tun_dst->u.tun_info.key.tun_flags);
 		tun_dst->u.tun_info.options_len = sizeof(*md);
 	}
 	if (gbp->dont_learn)
@@ -1596,13 +1605,11 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
 	/* In flow-based mode, GBP is carried in dst_metadata */
 	if (!(vxflags & VXLAN_F_COLLECT_METADATA))
 		skb->mark = md->gbp;
-out:
-	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
 }
 
-static bool vxlan_set_mac(struct vxlan_dev *vxlan,
-			  struct vxlan_sock *vs,
-			  struct sk_buff *skb, __be32 vni)
+static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan,
+					  struct vxlan_sock *vs,
+					  struct sk_buff *skb, __be32 vni)
 {
 	union vxlan_addr saddr;
 	u32 ifindex = skb->dev->ifindex;
@@ -1613,7 +1620,7 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan,
 
 	/* Ignore packet loops (and multicast echo) */
 	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
-		return false;
+		return SKB_DROP_REASON_LOCAL_MAC;
 
 	/* Get address from the outer IP header */
 	if (vxlan_get_sk_family(vs) == AF_INET) {
@@ -1626,11 +1633,11 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan,
 #endif
 	}
 
-	if ((vxlan->cfg.flags & VXLAN_F_LEARN) &&
-	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni))
-		return false;
+	if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
+		return SKB_NOT_DROPPED_YET;
 
-	return true;
+	return vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source,
+			   ifindex, vni);
 }
 
 static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
@@ -1661,68 +1668,93 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
 static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct vxlan_vni_node *vninode = NULL;
+	const struct vxlanhdr *vh;
 	struct vxlan_dev *vxlan;
 	struct vxlan_sock *vs;
-	struct vxlanhdr unparsed;
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
 	__be16 protocol = htons(ETH_P_TEB);
+	enum skb_drop_reason reason;
 	bool raw_proto = false;
 	void *oiph;
 	__be32 vni = 0;
+	int nh;
 
 	/* Need UDP and VXLAN header to be present */
-	if (!pskb_may_pull(skb, VXLAN_HLEN))
+	reason = pskb_may_pull_reason(skb, VXLAN_HLEN);
+	if (reason)
 		goto drop;
 
-	unparsed = *vxlan_hdr(skb);
+	vh = vxlan_hdr(skb);
 	/* VNI flag always required to be set */
-	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
+	if (!(vh->vx_flags & VXLAN_HF_VNI)) {
 		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
-			   ntohl(vxlan_hdr(skb)->vx_flags),
-			   ntohl(vxlan_hdr(skb)->vx_vni));
+			   ntohl(vh->vx_flags), ntohl(vh->vx_vni));
+		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
 		/* Return non vxlan pkt */
 		goto drop;
 	}
-	unparsed.vx_flags &= ~VXLAN_HF_VNI;
-	unparsed.vx_vni &= ~VXLAN_VNI_MASK;
 
 	vs = rcu_dereference_sk_user_data(sk);
 	if (!vs)
 		goto drop;
 
-	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
+	vni = vxlan_vni(vh->vx_vni);
 
 	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
-	if (!vxlan)
+	if (!vxlan) {
+		reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND;
 		goto drop;
+	}
 
-	/* For backwards compatibility, only allow reserved fields to be
-	 * used by VXLAN extensions if explicitly requested.
-	 */
-	if (vs->flags & VXLAN_F_GPE) {
-		if (!vxlan_parse_gpe_proto(&unparsed, &protocol))
+	if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags ||
+	    vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) {
+		/* If the header uses bits besides those enabled by the
+		 * netdevice configuration, treat this as a malformed packet.
+		 * This behavior diverges from VXLAN RFC (RFC7348) which
+		 * stipulates that bits in reserved in reserved fields are to be
+		 * ignored. The approach here maintains compatibility with
+		 * previous stack code, and also is more robust and provides a
+		 * little more security in adding extensions to VXLAN.
+		 */
+		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
+		DEV_STATS_INC(vxlan->dev, rx_frame_errors);
+		DEV_STATS_INC(vxlan->dev, rx_errors);
+		vxlan_vnifilter_count(vxlan, vni, vninode,
+				      VXLAN_VNI_STATS_RX_ERRORS, 0);
+		goto drop;
+	}
+
+	if (vxlan->cfg.flags & VXLAN_F_GPE) {
+		if (!vxlan_parse_gpe_proto(vh, &protocol))
 			goto drop;
-		unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS;
 		raw_proto = true;
 	}
 
 	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
-				   !net_eq(vxlan->net, dev_net(vxlan->dev))))
+				   !net_eq(vxlan->net, dev_net(vxlan->dev)))) {
+		reason = SKB_DROP_REASON_NOMEM;
 		goto drop;
+	}
 
-	if (vs->flags & VXLAN_F_REMCSUM_RX)
-		if (unlikely(!vxlan_remcsum(&unparsed, skb, vs->flags)))
+	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) {
+		reason = vxlan_remcsum(skb, vxlan->cfg.flags);
+		if (unlikely(reason))
 			goto drop;
+	}
 
 	if (vxlan_collect_metadata(vs)) {
+		IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 		struct metadata_dst *tun_dst;
 
-		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
+		__set_bit(IP_TUNNEL_KEY_BIT, flags);
+		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags,
 					 key32_to_tunnel_id(vni), sizeof(*md));
 
-		if (!tun_dst)
+		if (!tun_dst) {
+			reason = SKB_DROP_REASON_NOMEM;
 			goto drop;
+		}
 
 		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
 
@@ -1731,26 +1763,15 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 		memset(md, 0, sizeof(*md));
 	}
 
-	if (vs->flags & VXLAN_F_GBP)
-		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
+	if (vxlan->cfg.flags & VXLAN_F_GBP)
+		vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md);
 	/* Note that GBP and GPE can never be active together. This is
 	 * ensured in vxlan_dev_configure.
 	 */
 
-	if (unparsed.vx_flags || unparsed.vx_vni) {
-		/* If there are any unprocessed flags remaining treat
-		 * this as a malformed packet. This behavior diverges from
-		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
-		 * in reserved fields are to be ignored. The approach here
-		 * maintains compatibility with previous stack code, and also
-		 * is more robust and provides a little more security in
-		 * adding extensions to VXLAN.
-		 */
-		goto drop;
-	}
-
 	if (!raw_proto) {
-		if (!vxlan_set_mac(vxlan, vs, skb, vni))
+		reason = vxlan_set_mac(vxlan, vs, skb, vni);
+		if (reason)
 			goto drop;
 	} else {
 		skb_reset_mac_header(skb);
@@ -1758,12 +1779,30 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 		skb->pkt_type = PACKET_HOST;
 	}
 
-	oiph = skb_network_header(skb);
+	/* Save offset of outer header relative to skb->head,
+	 * because we are going to reset the network header to the inner header
+	 * and might change skb->head.
+	 */
+	nh = skb_network_header(skb) - skb->head;
+
 	skb_reset_network_header(skb);
+
+	reason = pskb_inet_may_pull_reason(skb);
+	if (reason) {
+		DEV_STATS_INC(vxlan->dev, rx_length_errors);
+		DEV_STATS_INC(vxlan->dev, rx_errors);
+		vxlan_vnifilter_count(vxlan, vni, vninode,
+				      VXLAN_VNI_STATS_RX_ERRORS, 0);
+		goto drop;
+	}
+
+	/* Get the outer header. */
+	oiph = skb->head + nh;
+
 	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
-		++vxlan->dev->stats.rx_frame_errors;
-		++vxlan->dev->stats.rx_errors;
+		reason = SKB_DROP_REASON_IP_TUNNEL_ECN;
+		DEV_STATS_INC(vxlan->dev, rx_frame_errors);
+		DEV_STATS_INC(vxlan->dev, rx_errors);
 		vxlan_vnifilter_count(vxlan, vni, vninode,
 				      VXLAN_VNI_STATS_RX_ERRORS, 0);
 		goto drop;
@@ -1773,13 +1812,14 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 
 	if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
 		rcu_read_unlock();
-		dev_core_stats_rx_dropped_inc(vxlan->dev);
+		dev_dstats_rx_dropped(vxlan->dev);
 		vxlan_vnifilter_count(vxlan, vni, vninode,
 				      VXLAN_VNI_STATS_RX_DROPS, 0);
+		reason = SKB_DROP_REASON_DEV_READY;
 		goto drop;
 	}
 
-	dev_sw_netstats_rx_add(vxlan->dev, skb->len);
+	dev_dstats_rx_add(vxlan->dev, skb->len);
 	vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len);
 	gro_cells_receive(&vxlan->gro_cells, skb);
 
@@ -1788,8 +1828,9 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 drop:
+	reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
 	/* Consume bad packet */
-	kfree_skb(skb);
+	kfree_skb_reason(skb, reason);
 	return 0;
 }
 
@@ -1833,7 +1874,9 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 		goto out;
 
 	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
-		dev->stats.tx_dropped++;
+		dev_dstats_tx_dropped(dev);
+		vxlan_vnifilter_count(vxlan, vni, NULL,
+				      VXLAN_VNI_STATS_TX_DROPS, 0);
 		goto out;
 	}
 	parp = arp_hdr(skb);
@@ -1889,7 +1932,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 		reply->pkt_type = PACKET_HOST;
 
 		if (netif_rx(reply) == NET_RX_DROP) {
-			dev->stats.rx_dropped++;
+			dev_dstats_rx_dropped(dev);
 			vxlan_vnifilter_count(vxlan, vni, NULL,
 					      VXLAN_VNI_STATS_RX_DROPS, 0);
 		}
@@ -2048,7 +2091,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 			goto out;
 
 		if (netif_rx(reply) == NET_RX_DROP) {
-			dev->stats.rx_dropped++;
+			dev_dstats_rx_dropped(dev);
 			vxlan_vnifilter_count(vxlan, vni, NULL,
 					      VXLAN_VNI_STATS_RX_DROPS, 0);
 		}
@@ -2222,8 +2265,8 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 {
 	union vxlan_addr loopback;
 	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
+	unsigned int len = skb->len;
 	struct net_device *dev;
-	int len = skb->len;
 
 	skb->pkt_type = PACKET_HOST;
 	skb->encapsulation = 0;
@@ -2243,23 +2286,23 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 	rcu_read_lock();
 	dev = skb->dev;
 	if (unlikely(!(dev->flags & IFF_UP))) {
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY);
 		goto drop;
 	}
 
 	if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
 		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);
 
-	dev_sw_netstats_tx_add(src_vxlan->dev, 1, len);
+	dev_dstats_tx_add(src_vxlan->dev, len);
 	vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len);
 
 	if (__netif_rx(skb) == NET_RX_SUCCESS) {
-		dev_sw_netstats_rx_add(dst_vxlan->dev, len);
+		dev_dstats_rx_add(dst_vxlan->dev, len);
 		vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX,
 				      len);
 	} else {
 drop:
-		dev->stats.rx_dropped++;
+		dev_dstats_rx_dropped(dev);
 		vxlan_vnifilter_count(dst_vxlan, vni, NULL,
 				      VXLAN_VNI_STATS_RX_DROPS, 0);
 	}
@@ -2291,10 +2334,10 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
 				   addr_family, dst_port,
 				   vxlan->cfg.flags);
 	if (!dst_vxlan) {
-		dev->stats.tx_errors++;
+		DEV_STATS_INC(dev, tx_errors);
 		vxlan_vnifilter_count(vxlan, vni, NULL,
 				      VXLAN_VNI_STATS_TX_ERRORS, 0);
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND);
 
 		return -ENOENT;
 	}
@@ -2313,7 +2356,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	struct ip_tunnel_key *pkey;
 	struct ip_tunnel_key key;
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	const struct iphdr *old_iph = ip_hdr(skb);
+	const struct iphdr *old_iph;
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
 	unsigned int pkt_len = skb->len;
@@ -2327,8 +2370,18 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	bool use_cache;
 	bool udp_sum = false;
 	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
+	enum skb_drop_reason reason;
+	bool no_eth_encap;
 	__be32 vni = 0;
 
+	no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB);
+	reason = skb_vlan_inet_prepare(skb, no_eth_encap);
+	if (reason)
+		goto drop;
+
+	reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	old_iph = ip_hdr(skb);
+
 	info = skb_tunnel_info(skb);
 	use_cache = ip_tunnel_dst_cache_usable(skb, info);
 
@@ -2403,14 +2456,14 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		vni = tunnel_id_to_key32(info->key.tun_id);
 		ifindex = 0;
 		dst_cache = &info->dst_cache;
-		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+		if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
 			if (info->options_len < sizeof(*md))
 				goto drop;
 			md = ip_tunnel_info_opts(info);
 		}
 		ttl = info->key.ttl;
 		tos = info->key.tos;
-		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
+		udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 	}
 	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
 				     vxlan->cfg.port_max, true);
@@ -2430,6 +2483,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				     tos, use_cache ? dst_cache : NULL);
 		if (IS_ERR(rt)) {
 			err = PTR_ERR(rt);
+			reason = SKB_DROP_REASON_IP_OUTNOROUTES;
 			goto tx_error;
 		}
 
@@ -2451,7 +2505,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				    old_iph->frag_off & htons(IP_DF)))
 					df = htons(IP_DF);
 			}
-		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
+		} else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+				    info->key.tun_flags)) {
 			df = htons(IP_DF);
 		}
 
@@ -2480,8 +2535,10 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
 				      vni, md, flags, udp_sum);
-		if (err < 0)
+		if (err < 0) {
+			reason = SKB_DROP_REASON_NOMEM;
 			goto tx_error;
+		}
 
 		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr,
 				    pkey->u.ipv4.dst, tos, ttl, df,
@@ -2501,11 +2558,12 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		if (IS_ERR(ndst)) {
 			err = PTR_ERR(ndst);
 			ndst = NULL;
+			reason = SKB_DROP_REASON_IP_OUTNOROUTES;
 			goto tx_error;
 		}
 
 		if (!info) {
-			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
+			u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags;
 
 			err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6,
 						    dst_port, ifindex, vni,
@@ -2541,8 +2599,10 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		skb_scrub_packet(skb, xnet);
 		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
 				      vni, md, flags, udp_sum);
-		if (err < 0)
+		if (err < 0) {
+			reason = SKB_DROP_REASON_NOMEM;
 			goto tx_error;
+		}
 
 		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
 				     &saddr, &pkey->u.ipv6.dst, tos, ttl,
@@ -2555,21 +2615,21 @@ out_unlock:
 	return;
 
 drop:
-	dev->stats.tx_dropped++;
+	dev_dstats_tx_dropped(dev);
 	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0);
-	dev_kfree_skb(skb);
+	kfree_skb_reason(skb, reason);
 	return;
 
 tx_error:
 	rcu_read_unlock();
 	if (err == -ELOOP)
-		dev->stats.collisions++;
+		DEV_STATS_INC(dev, collisions);
 	else if (err == -ENETUNREACH)
-		dev->stats.tx_carrier_errors++;
+		DEV_STATS_INC(dev, tx_carrier_errors);
 	dst_release(ndst);
-	dev->stats.tx_errors++;
+	DEV_STATS_INC(dev, tx_errors);
 	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
-	kfree_skb(skb);
+	kfree_skb_reason(skb, reason);
 }
 
 static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
@@ -2600,7 +2660,7 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
 	return;
 
 drop:
-	dev->stats.tx_dropped++;
+	dev_dstats_tx_dropped(dev);
 	vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
 			      VXLAN_VNI_STATS_TX_DROPS, 0);
 	dev_kfree_skb(skb);
@@ -2638,7 +2698,7 @@ static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev,
 	return NETDEV_TX_OK;
 
 drop:
-	dev->stats.tx_dropped++;
+	dev_dstats_tx_dropped(dev);
 	vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
 			      VXLAN_VNI_STATS_TX_DROPS, 0);
 	dev_kfree_skb(skb);
@@ -2656,11 +2716,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_rdst *rdst, *fdst = NULL;
 	const struct ip_tunnel_info *info;
-	bool did_rsc = false;
 	struct vxlan_fdb *f;
 	struct ethhdr *eth;
 	__be32 vni = 0;
 	u32 nhid = 0;
+	bool did_rsc;
 
 	info = skb_tunnel_info(skb);
 
@@ -2675,7 +2735,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 			if (info && info->mode & IP_TUNNEL_INFO_TX)
 				vxlan_xmit_one(skb, dev, vni, NULL, false);
 			else
-				kfree_skb(skb);
+				kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO);
 			return NETDEV_TX_OK;
 		}
 	}
@@ -2735,10 +2795,10 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 			    !is_multicast_ether_addr(eth->h_dest))
 				vxlan_fdb_miss(vxlan, eth->h_dest);
 
-			dev->stats.tx_dropped++;
+			dev_dstats_tx_dropped(dev);
 			vxlan_vnifilter_count(vxlan, vni, NULL,
 					      VXLAN_VNI_STATS_TX_DROPS, 0);
-			kfree_skb(skb);
+			kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET);
 			return NETDEV_TX_OK;
 		}
 	}
@@ -2761,7 +2821,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (fdst)
 			vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
 		else
-			kfree_skb(skb);
+			kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET);
 	}
 
 	return NETDEV_TX_OK;
 }
@@ -2838,29 +2898,25 @@ static int vxlan_init(struct net_device *dev)
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	int err;
 
-	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
-		vxlan_vnigroup_init(vxlan);
-
-	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
-	if (!dev->tstats) {
-		err = -ENOMEM;
-		goto err_vnigroup_uninit;
+	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
+		err = vxlan_vnigroup_init(vxlan);
+		if (err)
+			return err;
 	}
 
 	err = gro_cells_init(&vxlan->gro_cells, dev);
 	if (err)
-		goto err_free_percpu;
+		goto err_vnigroup_uninit;
 
 	err = vxlan_mdb_init(vxlan);
 	if (err)
		goto err_gro_cells_destroy;
 
+	netdev_lockdep_set_classes(dev);
 	return 0;
 
 err_gro_cells_destroy:
 	gro_cells_destroy(&vxlan->gro_cells);
-err_free_percpu:
-	free_percpu(dev->tstats);
 err_vnigroup_uninit:
 	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
 		vxlan_vnigroup_uninit(vxlan);
@@ -2891,8 +2947,6 @@ static void vxlan_uninit(struct net_device *dev)
 	gro_cells_destroy(&vxlan->gro_cells);
 
 	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
-
-	free_percpu(dev->tstats);
 }
 
 /* Start ageing timer and join group when device is brought up */
@@ -3163,7 +3217,7 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
 		return -EINVAL;
 	}
 
-	dev->mtu = new_mtu;
+	WRITE_ONCE(dev->mtu, new_mtu);
 	return 0;
 }
 
@@ -3223,7 +3277,6 @@ static const struct net_device_ops vxlan_netdev_ether_ops = {
 	.ndo_open		= vxlan_open,
 	.ndo_stop		= vxlan_stop,
 	.ndo_start_xmit		= vxlan_xmit,
-	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_rx_mode	= vxlan_set_multicast_list,
 	.ndo_change_mtu		= vxlan_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -3247,13 +3300,12 @@ static const struct net_device_ops vxlan_netdev_raw_ops = {
 	.ndo_open		= vxlan_open,
 	.ndo_stop		= vxlan_stop,
 	.ndo_start_xmit		= vxlan_xmit,
-	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_change_mtu		= vxlan_change_mtu,
 	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
 };
 
 /* Info for udev, that this is a virtual tunnel endpoint */
-static struct device_type vxlan_type = {
+static const struct device_type vxlan_type = {
 	.name = "vxlan",
 };
 
@@ -3299,7 +3351,6 @@ static void vxlan_setup(struct net_device *dev)
 	dev->needs_free_netdev = true;
 	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
 
-	dev->features	|= NETIF_F_LLTX;
 	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
 	dev->features	|= NETIF_F_RXCSUM;
 	dev->features	|= NETIF_F_GSO_SOFTWARE;
@@ -3309,12 +3360,15 @@ static void vxlan_setup(struct net_device *dev)
 	dev->hw_features |= NETIF_F_RXCSUM;
 	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 	netif_keep_dst(dev);
-	dev->priv_flags |= IFF_NO_QUEUE | IFF_CHANGE_PROTO_DOWN;
+	dev->priv_flags |= IFF_NO_QUEUE;
+	dev->change_proto_down = true;
+	dev->lltx = true;
 
 	/* MTU range: 68 - 65535 */
 	dev->min_mtu = ETH_MIN_MTU;
 	dev->max_mtu = ETH_MAX_MTU;
 
+	dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
 	INIT_LIST_HEAD(&vxlan->next);
 	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
 
@@ -3378,6 +3432,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_LOCALBYPASS]	= NLA_POLICY_MAX(NLA_U8, 1),
 	[IFLA_VXLAN_LABEL_POLICY]	= NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
+	[IFLA_VXLAN_RESERVED_BITS]	= NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)),
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -4013,6 +4068,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 			 struct net_device *dev, struct vxlan_config *conf,
 			 bool changelink, struct netlink_ext_ack *extack)
 {
+	struct vxlanhdr used_bits = {
+		.vx_flags = VXLAN_HF_VNI,
+		.vx_vni = VXLAN_VNI_MASK,
+	};
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	int err = 0;
 
@@ -4239,6 +4298,8 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 				    extack);
 		if (err)
 			return err;
+		used_bits.vx_flags |= VXLAN_HF_RCO;
+		used_bits.vx_vni |= ~VXLAN_VNI_MASK;
 	}
 
 	if (data[IFLA_VXLAN_GBP]) {
@@ -4246,6 +4307,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 				    VXLAN_F_GBP, changelink, false, extack);
 		if (err)
 			return err;
+		used_bits.vx_flags |= VXLAN_GBP_USED_BITS;
 	}
 
 	if (data[IFLA_VXLAN_GPE]) {
@@ -4254,6 +4316,46 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 				    extack);
 		if (err)
 			return err;
+
+		used_bits.vx_flags |= VXLAN_GPE_USED_BITS;
+	}
+
+	if (data[IFLA_VXLAN_RESERVED_BITS]) {
+		struct vxlanhdr reserved_bits;
+
+		if (changelink) {
+			NL_SET_ERR_MSG_ATTR(extack,
+					    data[IFLA_VXLAN_RESERVED_BITS],
+					    "Cannot change reserved_bits");
+			return -EOPNOTSUPP;
+		}
+
+		nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS],
+			   sizeof(reserved_bits));
+		if (used_bits.vx_flags & reserved_bits.vx_flags ||
+		    used_bits.vx_vni & reserved_bits.vx_vni) {
+			__be64 ub_be64, rb_be64;
+
+			memcpy(&ub_be64, &used_bits, sizeof(ub_be64));
+			memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64));
+
+			NL_SET_ERR_MSG_ATTR_FMT(extack,
+						data[IFLA_VXLAN_RESERVED_BITS],
+						"Used bits %#018llx cannot overlap reserved bits %#018llx",
+						be64_to_cpu(ub_be64),
+						be64_to_cpu(rb_be64));
+			return -EINVAL;
+		}
+
+		conf->reserved_bits = reserved_bits;
+	} else {
+		/* For backwards compatibility, only allow reserved fields to be
+		 * used by VXLAN extensions if explicitly requested.
+		 */
+		conf->reserved_bits = (struct vxlanhdr) {
+			.vx_flags = ~used_bits.vx_flags,
+			.vx_vni = ~used_bits.vx_vni,
+		};
 	}
 
 	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
@@ -4440,6 +4542,8 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(0) + /* IFLA_VXLAN_GPE */
 		nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */
+		/* IFLA_VXLAN_RESERVED_BITS */
+		nla_total_size(sizeof(struct vxlanhdr)) +
 		0;
 }
 
@@ -4542,6 +4646,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			!!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
 		goto nla_put_failure;
 
+	if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS,
+		    sizeof(vxlan->cfg.reserved_bits),
+		    &vxlan->cfg.reserved_bits))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -4552,7 +4661,7 @@ static struct net *vxlan_get_link_net(const struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 
-	return vxlan->net;
+	return READ_ONCE(vxlan->net);
 }
 
 static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
@@ -4826,55 +4935,43 @@ static __net_init int vxlan_init_net(struct net *net)
 					  NULL);
 }
 
-static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
+static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn,
+					     struct list_head *dev_to_kill)
 {
-	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_dev *vxlan, *next;
-	struct net_device *dev, *aux;
-
-	for_each_netdev_safe(net, dev, aux)
-		if (dev->rtnl_link_ops == &vxlan_link_ops)
-			unregister_netdevice_queue(dev, head);
-
-	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
-		/* If vxlan->dev is in the same netns, it has already been added
-		 * to the list by the previous loop.
-		 */
-		if (!net_eq(dev_net(vxlan->dev), net))
-			unregister_netdevice_queue(vxlan->dev, head);
-	}
+
+	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next)
+		vxlan_dellink(vxlan->dev, dev_to_kill);
 }
 
-static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
+static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list,
+					     struct list_head *dev_to_kill)
 {
 	struct net *net;
-	LIST_HEAD(list);
-	unsigned int h;
 
+	ASSERT_RTNL();
 	list_for_each_entry(net, net_list, exit_list) {
 		struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 
-		unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);
-	}
-	rtnl_lock();
-	list_for_each_entry(net, net_list, exit_list)
-		vxlan_destroy_tunnels(net, &list);
+		__unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);
 
-	unregister_netdevice_many(&list);
-	rtnl_unlock();
+		vxlan_destroy_tunnels(vn, dev_to_kill);
+	}
+}
 
-	list_for_each_entry(net, net_list, exit_list) {
-		struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+static void __net_exit vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	unsigned int h;
 
-		for (h = 0; h < PORT_HASH_SIZE; ++h)
-			WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
-	}
+	for (h = 0; h < PORT_HASH_SIZE; ++h)
+		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
 }
 
 static struct pernet_operations vxlan_net_ops = {
 	.init = vxlan_init_net,
-	.exit_batch = vxlan_exit_batch_net,
+	.exit_batch_rtnl = vxlan_exit_batch_rtnl,
+	.exit = vxlan_exit_net,
 	.id   = &vxlan_net_id,
 	.size = sizeof(struct vxlan_net),
 };
@@ -4901,9 +4998,13 @@ static int __init vxlan_init_module(void)
 	if (rc)
 		goto out4;
 
-	vxlan_vnifilter_init();
+	rc = vxlan_vnifilter_init();
+	if (rc)
+		goto out5;
 
 	return 0;
+out5:
+	rtnl_link_unregister(&vxlan_link_ops);
 out4:
 	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
 out3:
diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c
index 60eb95a06d55..816ab1aa0526 100644
--- a/drivers/net/vxlan/vxlan_mdb.c
+++ b/drivers/net/vxlan/vxlan_mdb.c
@@ -284,7 +284,7 @@ int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb,
 
 	ASSERT_RTNL();
 
-	NL_ASSERT_DUMP_CTX_FITS(struct vxlan_mdb_dump_ctx);
+	NL_ASSERT_CTX_FITS(struct vxlan_mdb_dump_ctx);
 
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			RTM_NEWMDB, sizeof(*bpm),
@@ -1712,7 +1712,7 @@ netdev_tx_t vxlan_mdb_xmit(struct vxlan_dev *vxlan,
 		vxlan_xmit_one(skb, vxlan->dev, src_vni,
 			       rcu_dereference(fremote->rd), false);
 	else
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET);
 
 	return NETDEV_TX_OK;
 }
diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h
index b35d96b78843..76a351a997d5 100644
--- a/drivers/net/vxlan/vxlan_private.h
+++ b/drivers/net/vxlan/vxlan_private.h
@@ -202,7 +202,7 @@ int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
 int vxlan_vnigroup_init(struct vxlan_dev *vxlan);
 void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan);
 
-void vxlan_vnifilter_init(void);
+int vxlan_vnifilter_init(void);
 void vxlan_vnifilter_uninit(void);
 void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni,
 			   struct vxlan_vni_node *vninode,
diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c
index 9c59d0bf8c3d..6e6e9f05509a 100644
--- a/drivers/net/vxlan/vxlan_vnifilter.c
+++ b/drivers/net/vxlan/vxlan_vnifilter.c
@@ -411,6 +411,11 @@ static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb
 	struct tunnel_msg *tmsg;
 	struct net_device *dev;
 
+	if (cb->nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct tunnel_msg))) {
+		NL_SET_ERR_MSG(cb->extack, "Invalid msg length");
+		return -EINVAL;
+	}
+
 	tmsg = nlmsg_data(cb->nlh);
 
 	if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) {
@@ -992,19 +997,18 @@ static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-void vxlan_vnifilter_init(void)
+static const struct rtnl_msg_handler vxlan_vnifilter_rtnl_msg_handlers[] = {
+	{THIS_MODULE, PF_BRIDGE, RTM_GETTUNNEL, NULL, vxlan_vnifilter_dump, 0},
+	{THIS_MODULE, PF_BRIDGE, RTM_NEWTUNNEL, vxlan_vnifilter_process, NULL, 0},
+	{THIS_MODULE, PF_BRIDGE, RTM_DELTUNNEL, vxlan_vnifilter_process, NULL, 0},
+};
+
+int vxlan_vnifilter_init(void)
 {
-	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETTUNNEL, NULL,
-			     vxlan_vnifilter_dump, 0);
-	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWTUNNEL,
-			     vxlan_vnifilter_process, NULL, 0);
-	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELTUNNEL,
-			     vxlan_vnifilter_process, NULL, 0);
+	return rtnl_register_many(vxlan_vnifilter_rtnl_msg_handlers);
 }
 
 void vxlan_vnifilter_uninit(void)
 {
-	rtnl_unregister(PF_BRIDGE, RTM_GETTUNNEL);
-	rtnl_unregister(PF_BRIDGE, RTM_NEWTUNNEL);
-	rtnl_unregister(PF_BRIDGE, RTM_DELTUNNEL);
+	rtnl_unregister_many(vxlan_vnifilter_rtnl_msg_handlers);
 }
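Editor's note (not part of the patch): the vxlan_nl2conf() hunk above rejects an IFLA_VXLAN_RESERVED_BITS value whose bits overlap anything already consumed by the enabled extensions, and falls back to "everything unused is reserved" when the attribute is absent. Below is a minimal, self-contained userspace sketch of that overlap check; struct vxlanhdr_sketch, reserved_bits_ok() and the host-order constants are simplified stand-ins for the kernel's big-endian struct vxlanhdr and flag masks, not code taken from the patch.

/* Hypothetical stand-ins for the kernel's struct vxlanhdr and flag masks;
 * values are host-order simplifications (the kernel uses __be32). */
#include <stdint.h>
#include <stdio.h>

struct vxlanhdr_sketch {
	uint32_t vx_flags;
	uint32_t vx_vni;
};

#define VXLAN_HF_VNI_SKETCH   (1u << 27)  /* "VNI present" flag bit */
#define VXLAN_VNI_MASK_SKETCH 0xffffff00u /* 24-bit VNI field */

/* Mirrors the overlap test in the patch: bits the user asks to treat as
 * reserved must not collide with bits an enabled extension already uses. */
static int reserved_bits_ok(const struct vxlanhdr_sketch *used,
			    const struct vxlanhdr_sketch *reserved)
{
	return !((used->vx_flags & reserved->vx_flags) ||
		 (used->vx_vni & reserved->vx_vni));
}

int main(void)
{
	struct vxlanhdr_sketch used = {
		.vx_flags = VXLAN_HF_VNI_SKETCH,
		.vx_vni   = VXLAN_VNI_MASK_SKETCH,
	};
	/* Default from the patch when the attribute is absent: every bit not
	 * used by an enabled extension is treated as reserved. */
	struct vxlanhdr_sketch reserved = {
		.vx_flags = ~used.vx_flags,
		.vx_vni   = ~used.vx_vni,
	};

	printf("default reserved bits accepted: %s\n",
	       reserved_bits_ok(&used, &reserved) ? "yes" : "no");

	/* Asking to reserve the VNI-present flag itself must be rejected. */
	reserved.vx_flags |= VXLAN_HF_VNI_SKETCH;
	printf("overlapping reserved bits accepted: %s\n",
	       reserved_bits_ok(&used, &reserved) ? "yes" : "no");
	return 0;
}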