diff options
Diffstat (limited to 'drivers/net/vrf.c')
| -rw-r--r-- | drivers/net/vrf.c | 959 |
1 files changed, 752 insertions, 207 deletions
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 95909e262ba4..571847a7f86d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * @@ -6,13 +7,9 @@ * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ +#include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -25,9 +22,11 @@ #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> +#include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> +#include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> @@ -36,15 +35,82 @@ #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> +#include <net/netdev_lock.h> +#include <net/sch_generic.h> #include <net/netns/generic.h> +#include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" -#define DRV_VERSION "1.0" +#define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ +#define HT_MAP_BITS 4 +#define HASH_INITVAL ((u32)0xcafef00d) + +struct vrf_map { + DECLARE_HASHTABLE(ht, HT_MAP_BITS); + spinlock_t vmap_lock; + + /* shared_tables: + * count how many distinct tables do not comply with the strict mode + * requirement. + * shared_tables value must be 0 in order to enable the strict mode. + * + * example of the evolution of shared_tables: + * | time + * add vrf0 --> table 100 shared_tables = 0 | t0 + * add vrf1 --> table 101 shared_tables = 0 | t1 + * add vrf2 --> table 100 shared_tables = 1 | t2 + * add vrf3 --> table 100 shared_tables = 1 | t3 + * add vrf4 --> table 101 shared_tables = 2 v t4 + * + * shared_tables is a "step function" (or "staircase function") + * and it is increased by one when the second vrf is associated to a + * table. + * + * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. + * + * at t3, another dev (vrf3) is bound to the same table 100 but the + * value of shared_tables is still 1. + * This means that no matter how many new vrfs will register on the + * table 100, the shared_tables will not increase (considering only + * table 100). + * + * at t4, vrf4 is bound to table 101, and shared_tables = 2. + * + * Looking at the value of shared_tables we can immediately know if + * the strict_mode can or cannot be enforced. Indeed, strict_mode + * can be enforced iff shared_tables = 0. + * + * Conversely, shared_tables is decreased when a vrf is de-associated + * from a table with exactly two associated vrfs. + */ + u32 shared_tables; + + bool strict_mode; +}; + +struct vrf_map_elem { + struct hlist_node hnode; + struct list_head vrf_list; /* VRFs registered to this table */ + + u32 table_id; + int users; + int ifindex; +}; + static unsigned int vrf_net_id; +/* per netns vrf data */ +struct netns_vrf { + /* protected by rtnl lock */ + bool add_fib_rules; + + struct vrf_map vmap; + struct ctl_table_header *ctl_hdr; +}; + struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; @@ -52,59 +118,223 @@ struct net_vrf { struct fib6_table *fib6_table; #endif u32 tb_id; -}; -struct pcpu_dstats { - u64 tx_pkts; - u64 tx_bytes; - u64 tx_drps; - u64 rx_pkts; - u64 rx_bytes; - u64 rx_drps; - struct u64_stats_sync syncp; + struct list_head me_list; /* entry in vrf_map_elem */ + int ifindex; }; -static void vrf_rx_stats(struct net_device *dev, int len) +static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + vrf_dev->stats.tx_errors++; + kfree_skb(skb); +} + +static struct vrf_map *netns_vrf_map(struct net *net) +{ + struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); - u64_stats_update_begin(&dstats->syncp); - dstats->rx_pkts++; - dstats->rx_bytes += len; - u64_stats_update_end(&dstats->syncp); + return &nn_vrf->vmap; } -static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) +static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { - vrf_dev->stats.tx_errors++; - kfree_skb(skb); + return netns_vrf_map(dev_net(dev)); +} + +static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) +{ + struct list_head *me_head = &me->vrf_list; + struct net_vrf *vrf; + + if (list_empty(me_head)) + return -ENODEV; + + vrf = list_first_entry(me_head, struct net_vrf, me_list); + + return vrf->ifindex; +} + +static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) +{ + struct vrf_map_elem *me; + + me = kmalloc(sizeof(*me), flags); + if (!me) + return NULL; + + return me; +} + +static void vrf_map_elem_free(struct vrf_map_elem *me) +{ + kfree(me); +} + +static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, + int ifindex, int users) +{ + me->table_id = table_id; + me->ifindex = ifindex; + me->users = users; + INIT_LIST_HEAD(&me->vrf_list); +} + +static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, + u32 table_id) +{ + struct vrf_map_elem *me; + u32 key; + + key = jhash_1word(table_id, HASH_INITVAL); + hash_for_each_possible(vmap->ht, me, hnode, key) { + if (me->table_id == table_id) + return me; + } + + return NULL; +} + +static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) +{ + u32 table_id = me->table_id; + u32 key; + + key = jhash_1word(table_id, HASH_INITVAL); + hash_add(vmap->ht, &me->hnode, key); +} + +static void vrf_map_del_elem(struct vrf_map_elem *me) +{ + hash_del(&me->hnode); +} + +static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) +{ + spin_lock(&vmap->vmap_lock); +} + +static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) +{ + spin_unlock(&vmap->vmap_lock); } -static void vrf_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_dstats *dstats; - u64 tbytes, tpkts, tdrops, rbytes, rpkts; - unsigned int start; - - dstats = per_cpu_ptr(dev->dstats, i); - do { - start = u64_stats_fetch_begin_irq(&dstats->syncp); - tbytes = dstats->tx_bytes; - tpkts = dstats->tx_pkts; - tdrops = dstats->tx_drps; - rbytes = dstats->rx_bytes; - rpkts = dstats->rx_pkts; - } while (u64_stats_fetch_retry_irq(&dstats->syncp, start)); - stats->tx_bytes += tbytes; - stats->tx_packets += tpkts; - stats->tx_dropped += tdrops; - stats->rx_bytes += rbytes; - stats->rx_packets += rpkts; +/* called with rtnl lock held */ +static int +vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) +{ + struct vrf_map *vmap = netns_vrf_map_by_dev(dev); + struct net_vrf *vrf = netdev_priv(dev); + struct vrf_map_elem *new_me, *me; + u32 table_id = vrf->tb_id; + bool free_new_me = false; + int users; + int res; + + /* we pre-allocate elements used in the spin-locked section (so that we + * keep the spinlock as short as possible). + */ + new_me = vrf_map_elem_alloc(GFP_KERNEL); + if (!new_me) + return -ENOMEM; + + vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); + + vrf_map_lock(vmap); + + me = vrf_map_lookup_elem(vmap, table_id); + if (!me) { + me = new_me; + vrf_map_add_elem(vmap, me); + goto link_vrf; } + + /* we already have an entry in the vrf_map, so it means there is (at + * least) a vrf registered on the specific table. + */ + free_new_me = true; + if (vmap->strict_mode) { + /* vrfs cannot share the same table */ + NL_SET_ERR_MSG(extack, "Table is used by another VRF"); + res = -EBUSY; + goto unlock; + } + +link_vrf: + users = ++me->users; + if (users == 2) + ++vmap->shared_tables; + + list_add(&vrf->me_list, &me->vrf_list); + + res = 0; + +unlock: + vrf_map_unlock(vmap); + + /* clean-up, if needed */ + if (free_new_me) + vrf_map_elem_free(new_me); + + return res; +} + +/* called with rtnl lock held */ +static void vrf_map_unregister_dev(struct net_device *dev) +{ + struct vrf_map *vmap = netns_vrf_map_by_dev(dev); + struct net_vrf *vrf = netdev_priv(dev); + u32 table_id = vrf->tb_id; + struct vrf_map_elem *me; + int users; + + vrf_map_lock(vmap); + + me = vrf_map_lookup_elem(vmap, table_id); + if (!me) + goto unlock; + + list_del(&vrf->me_list); + + users = --me->users; + if (users == 1) { + --vmap->shared_tables; + } else if (users == 0) { + vrf_map_del_elem(me); + + /* no one will refer to this element anymore */ + vrf_map_elem_free(me); + } + +unlock: + vrf_map_unlock(vmap); +} + +/* return the vrf device index associated with the table_id */ +static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) +{ + struct vrf_map *vmap = netns_vrf_map(net); + struct vrf_map_elem *me; + int ifindex; + + vrf_map_lock(vmap); + + if (!vmap->strict_mode) { + ifindex = -EPERM; + goto unlock; + } + + me = vrf_map_lookup_elem(vmap, table_id); + if (!me) { + ifindex = -ENODEV; + goto unlock; + } + + ifindex = vrf_map_elem_get_vrf_ifindex(me); + +unlock: + vrf_map_unlock(vmap); + + return ifindex; } /* by default VRF devices do not have a qdisc and are expected @@ -113,15 +343,13 @@ static void vrf_get_stats64(struct net_device *dev, static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; - struct Qdisc *qdisc; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); - qdisc = rcu_access_pointer(txq->qdisc); - return !qdisc->enqueue; + return qdisc_txq_has_no_queue(txq); } /* Local traffic destined to local address. Reinsert the packet to rx @@ -130,7 +358,7 @@ static bool qdisc_tx_is_default(const struct net_device *dev) static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { - int len = skb->len; + unsigned int len = skb->len; skb_orphan(skb); @@ -143,20 +371,34 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, skb->protocol = eth_type_trans(skb, dev); - if (likely(netif_rx(skb) == NET_RX_SUCCESS)) - vrf_rx_stats(dev, len); + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) + dev_dstats_rx_add(dev, len); else - this_cpu_inc(dev->dstats->rx_drps); + dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } +static void vrf_nf_set_untracked(struct sk_buff *skb) +{ + if (skb_get_nfct(skb) == 0) + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); +} + +static void vrf_nf_reset_ct(struct sk_buff *skb) +{ + if (skb_get_nfct(skb) == IP_CT_UNTRACKED) + nf_reset_ct(skb); +} + #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; + vrf_nf_reset_ct(skb); + err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); @@ -169,32 +411,36 @@ static int vrf_ip6_local_out(struct net *net, struct sock *sk, static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); - struct flowi6 fl6 = { - /* needed to match OIF rule */ - .flowi6_oif = dev->ifindex, - .flowi6_iif = LOOPBACK_IFINDEX, - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowlabel = ip6_flowinfo(iph), - .flowi6_mark = skb->mark, - .flowi6_proto = iph->nexthdr, - .flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF, - }; + struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; - dst = ip6_route_output(net, NULL, &fl6); - if (dst == dst_null) + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) + goto err; + + iph = ipv6_hdr(skb); + + memset(&fl6, 0, sizeof(fl6)); + /* needed to match OIF rule */ + fl6.flowi6_l3mdev = dev->ifindex; + fl6.flowi6_iif = LOOPBACK_IFINDEX; + fl6.daddr = iph->daddr; + fl6.saddr = iph->saddr; + fl6.flowlabel = ip6_flowinfo(iph); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = iph->nexthdr; + + dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); - /* if dst.dev is loopback or the VRF device again this is locally - * originated traffic destined to a local address. Short circuit - * to Rx path + /* if dst.dev is the VRF device again this is locally originated traffic + * destined to a local address. Short circuit to Rx path. */ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); @@ -204,6 +450,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; @@ -230,6 +477,8 @@ static int vrf_ip_local_out(struct net *net, struct sock *sk, { int err; + vrf_nf_reset_ct(skb); + err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) @@ -241,30 +490,35 @@ static int vrf_ip_local_out(struct net *net, struct sock *sk, static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { - struct iphdr *ip4h = ip_hdr(skb); + struct iphdr *ip4h; int ret = NET_XMIT_DROP; - struct flowi4 fl4 = { - /* needed to match OIF rule */ - .flowi4_oif = vrf_dev->ifindex, - .flowi4_iif = LOOPBACK_IFINDEX, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; + struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) + goto err; + + ip4h = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + /* needed to match OIF rule */ + fl4.flowi4_l3mdev = vrf_dev->ifindex; + fl4.flowi4_iif = LOOPBACK_IFINDEX; + fl4.flowi4_dscp = ip4h_dscp(ip4h); + fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; + fl4.flowi4_proto = ip4h->protocol; + fl4.daddr = ip4h->daddr; + fl4.saddr = ip4h->saddr; + rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); - /* if dst.dev is loopback or the VRF device again this is locally - * originated traffic destined to a local address. Short circuit - * to Rx path + /* if dst.dev is the VRF device again this is locally originated traffic + * destined to a local address. Short circuit to Rx path. */ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); @@ -279,6 +533,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, RT_SCOPE_LINK); } + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; @@ -307,25 +562,19 @@ static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { - int len = skb->len; - netdev_tx_t ret = is_ip_tx_frame(skb, dev); - - if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + unsigned int len = skb->len; + netdev_tx_t ret; - u64_stats_update_begin(&dstats->syncp); - dstats->tx_pkts++; - dstats->tx_bytes += len; - u64_stats_update_end(&dstats->syncp); - } else { - this_cpu_inc(dev->dstats->tx_drps); - } + ret = is_ip_tx_frame(skb, dev); + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) + dev_dstats_tx_add(dev, len); + else + dev_dstats_tx_dropped(dev); return ret; } -static int vrf_finish_direct(struct net *net, struct sock *sk, - struct sk_buff *skb) +static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; @@ -344,7 +593,7 @@ static int vrf_finish_direct(struct net *net, struct sock *sk, skb_pull(skb, ETH_HLEN); } - return 1; + vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) @@ -354,27 +603,27 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; + const struct in6_addr *nexthop; struct neighbour *neigh; - struct in6_addr *nexthop; int ret; - nf_reset(skb); + vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; - rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); + rcu_read_lock(); + nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); - ret = neigh_output(neigh, skb); - rcu_read_unlock_bh(); + ret = neigh_output(neigh, skb, false); + rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -423,15 +672,41 @@ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, return skb; } +static int vrf_output6_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + vrf_finish_direct(skb); + + return vrf_ip6_local_out(net, sk, skb); +} + static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { + int err = 1; + skb->protocol = htons(ETH_P_IPV6); - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb->dev, - vrf_finish_direct, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + if (!(IPCB(skb)->flags & IPSKB_REROUTED)) + err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, + NULL, skb->dev, vrf_output6_direct_finish); + + if (likely(err == 1)) + vrf_finish_direct(skb); + + return err; +} + +static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = vrf_output6_direct(net, sk, skb); + if (likely(err == 1)) + err = vrf_ip6_local_out(net, sk, skb); + + return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, @@ -444,18 +719,15 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, - skb, NULL, vrf_dev, vrf_output6_direct); + skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); - /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); - else - skb = NULL; + return skb; - return skb; + return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, @@ -466,7 +738,10 @@ static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; - if (qdisc_tx_is_default(vrf_dev)) + vrf_nf_set_untracked(skb); + + if (qdisc_tx_is_default(vrf_dev) || + IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); @@ -487,16 +762,16 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) */ if (rt6) { dst = &rt6->dst; - dev_put(dst->dev); + netdev_ref_replace(dst->dev, net->loopback_dev, + &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; - dev_hold(dst->dev); dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { - int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM; + int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; @@ -545,48 +820,39 @@ static int vrf_rt6_create(struct net_device *dev) static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; - u32 nexthop; - int ret = -EINVAL; + bool is_v6gw = false; - nf_reset(skb); + vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); - if (!skb2) { - ret = -ENOMEM; - goto err; + skb = skb_expand_head(skb, hh_len); + if (!skb) { + dev->stats.tx_errors++; + return -ENOMEM; } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - - consume_skb(skb); - skb = skb2; } - rcu_read_lock_bh(); + rcu_read_lock(); - nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); - neigh = __ipv4_neigh_lookup_noref(dev, nexthop); - if (unlikely(!neigh)) - neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { + int ret; + sock_confirm_neigh(skb, neigh); - ret = neigh_output(neigh, skb); - rcu_read_unlock_bh(); + /* if crossing protocols, can not use the cached header */ + ret = neigh_output(neigh, skb, is_v6gw); + rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); -err: + rcu_read_unlock(); vrf_tx_error(skb->dev, skb); - return ret; + return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -636,15 +902,41 @@ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, return skb; } +static int vrf_output_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + vrf_finish_direct(skb); + + return vrf_ip_local_out(net, sk, skb); +} + static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { + int err = 1; + skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb->dev, - vrf_finish_direct, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + if (!(IPCB(skb)->flags & IPSKB_REROUTED)) + err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, + NULL, skb->dev, vrf_output_direct_finish); + + if (likely(err == 1)) + vrf_finish_direct(skb); + + return err; +} + +static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = vrf_output_direct(net, sk, skb); + if (likely(err == 1)) + err = vrf_ip_local_out(net, sk, skb); + + return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, @@ -657,18 +949,15 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, - skb, NULL, vrf_dev, vrf_output_direct); + skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); - /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); - else - skb = NULL; + return skb; - return skb; + return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, @@ -680,7 +969,10 @@ static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; - if (qdisc_tx_is_default(vrf_dev)) + vrf_nf_set_untracked(skb); + + if (qdisc_tx_is_default(vrf_dev) || + IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); @@ -717,9 +1009,9 @@ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) */ if (rth) { dst = &rth->dst; - dev_put(dst->dev); + netdev_ref_replace(dst->dev, net->loopback_dev, + &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; - dev_hold(dst->dev); dst_release(dst); } } @@ -733,7 +1025,7 @@ static int vrf_rtable_create(struct net_device *dev) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ - rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0); + rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; @@ -832,31 +1124,21 @@ static void vrf_dev_uninit(struct net_device *dev) vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); - - free_percpu(dev->dstats); - dev->dstats = NULL; } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); - dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); - if (!dev->dstats) - goto out_nomem; - /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) - goto out_stats; + goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; - /* MTU is irrelevant for VRF device; set to 64k similar to lo */ - dev->mtu = 64 * 1024; - /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); @@ -864,9 +1146,6 @@ static int vrf_dev_init(struct net_device *dev) out_rth: vrf_rtable_release(dev, vrf); -out_stats: - free_percpu(dev->dstats); - dev->dstats = NULL; out_nomem: return -ENOMEM; } @@ -875,7 +1154,7 @@ static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, - .ndo_get_stats64 = vrf_get_stats64, + .ndo_set_mac_address = eth_mac_addr, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; @@ -905,6 +1184,62 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, return skb; } +static int vrf_prepare_mac_header(struct sk_buff *skb, + struct net_device *vrf_dev, u16 proto) +{ + struct ethhdr *eth; + int err; + + /* in general, we do not know if there is enough space in the head of + * the packet for hosting the mac header. + */ + err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); + if (unlikely(err)) + /* no space in the skb head */ + return -ENOBUFS; + + __skb_push(skb, ETH_HLEN); + eth = (struct ethhdr *)skb->data; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + /* we set the ethernet destination and the source addresses to the + * address of the VRF device. + */ + ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); + ether_addr_copy(eth->h_source, vrf_dev->dev_addr); + eth->h_proto = htons(proto); + + /* the destination address of the Ethernet frame corresponds to the + * address set on the VRF interface; therefore, the packet is intended + * to be processed locally. + */ + skb->protocol = eth->h_proto; + skb->pkt_type = PACKET_HOST; + + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + + skb_pull_inline(skb, ETH_HLEN); + + return 0; +} + +/* prepare and add the mac header to the packet if it was not set previously. + * In this way, packet sniffers such as tcpdump can parse the packet correctly. + * If the mac header was already set, the original mac header is left + * untouched and the function returns immediately. + */ +static int vrf_add_mac_header_if_unset(struct sk_buff *skb, + struct net_device *vrf_dev, + u16 proto, struct net_device *orig_dev) +{ + if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) + return 0; + + return vrf_prepare_mac_header(skb, vrf_dev, proto); +} + #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails @@ -967,6 +1302,8 @@ static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; + skb_dst_drop(skb); + rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) @@ -986,27 +1323,42 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through - * packet taps again. Reset pkt_type for upper layers to process skb + * packet taps again. Reset pkt_type for upper layers to process skb. + * For non-loopback strict packets, determine the dst using the original + * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; + if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; + else + vrf_ip6_input_dst(skb, vrf_dev, orig_iif); + goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { - vrf_rx_stats(vrf_dev, skb->len); + struct net_device *orig_dev = skb->dev; + + dev_dstats_rx_add(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { - skb_push(skb, skb->mac_len); - dev_queue_xmit_nit(skb, vrf_dev); - skb_pull(skb, skb->mac_len); + int err; + + err = vrf_add_mac_header_if_unset(skb, vrf_dev, + ETH_P_IPV6, + orig_dev); + if (likely(!err)) { + skb_push(skb, skb->mac_len); + dev_queue_xmit_nit(skb, vrf_dev); + skb_pull(skb, skb->mac_len); + } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; @@ -1031,6 +1383,8 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { + struct net_device *orig_dev = skb->dev; + skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; @@ -1046,12 +1400,18 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, goto out; } - vrf_rx_stats(vrf_dev, skb->len); + dev_dstats_rx_add(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { - skb_push(skb, skb->mac_len); - dev_queue_xmit_nit(skb, vrf_dev); - skb_pull(skb, skb->mac_len); + int err; + + err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, + orig_dev); + if (likely(!err)) { + skb_push(skb, skb->mac_len); + dev_queue_xmit_nit(skb, vrf_dev); + skb_pull(skb, skb->mac_len); + } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); @@ -1077,12 +1437,14 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct + * Note: Caller to this function must hold rcu_read_lock() and no refcnt + * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); - int flags = RT6_LOOKUP_F_IFACE; + int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; @@ -1092,7 +1454,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; - dst_hold(dst); return dst; } @@ -1119,8 +1480,8 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); - strlcpy(info->version, DRV_VERSION, sizeof(info->version)); + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); + strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { @@ -1146,7 +1507,8 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) struct sk_buff *skb; int err; - if (family == AF_INET6 && !ipv6_mod_enabled()) + if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && + !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); @@ -1176,14 +1538,12 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) nlmsg_end(skb, nlh); - /* fib_nl_{new,del}rule handling looks for net from skb->sk */ - skb->sk = dev_net(dev)->rtnl; if (add_it) { - err = fib_nl_newrule(skb, nlh, NULL); + err = fib_newrule(dev_net(dev), skb, nlh, NULL, true); if (err == -EEXIST) err = 0; } else { - err = fib_nl_delrule(skb, nlh, NULL); + err = fib_delrule(dev_net(dev), skb, nlh, NULL, true); if (err == -ENOENT) err = 0; } @@ -1255,10 +1615,10 @@ static void vrf_setup(struct net_device *dev) eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ - dev->features |= NETIF_F_LLTX; + dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_immutable = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; @@ -1273,6 +1633,18 @@ static void vrf_setup(struct net_device *dev) /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; + dev->priv_flags |= IFF_NO_RX_HANDLER; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + + /* VRF devices do not care about MTU, but if the MTU is set + * too low then the ipv4 and ipv6 protocols are disabled + * which breaks networking. + */ + dev->min_mtu = IPV6_MIN_MTU; + dev->max_mtu = IP6_MAX_MTU; + dev->mtu = dev->max_mtu; + + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1299,14 +1671,18 @@ static void vrf_dellink(struct net_device *dev, struct list_head *head) netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); + vrf_map_unregister_dev(dev); + unregister_netdevice_queue(dev, head); } -static int vrf_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vrf_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); + struct nlattr **data = params->data; + struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; int err; @@ -1329,11 +1705,26 @@ static int vrf_newlink(struct net *src_net, struct net_device *dev, if (err) goto out; + /* mapping between table_id and vrf; + * note: such binding could not be done in the dev init function + * because dev->ifindex id is not available yet. + */ + vrf->ifindex = dev->ifindex; + + err = vrf_map_register_dev(dev, extack); + if (err) { + unregister_netdevice(dev); + goto out; + } + net = dev_net(dev); - add_fib_rules = net_generic(net, vrf_net_id); + nn_vrf = net_generic(net, vrf_net_id); + + add_fib_rules = &nn_vrf->add_fib_rules; if (*add_fib_rules) { err = vrf_add_fib_rules(dev); if (err) { + vrf_map_unregister_dev(dev); unregister_netdevice(dev); goto out; } @@ -1420,20 +1811,164 @@ static struct notifier_block vrf_notifier_block __read_mostly = { .notifier_call = vrf_device_event, }; +static int vrf_map_init(struct vrf_map *vmap) +{ + spin_lock_init(&vmap->vmap_lock); + hash_init(vmap->ht); + + vmap->strict_mode = false; + + return 0; +} + +#ifdef CONFIG_SYSCTL +static bool vrf_strict_mode(struct vrf_map *vmap) +{ + bool strict_mode; + + vrf_map_lock(vmap); + strict_mode = vmap->strict_mode; + vrf_map_unlock(vmap); + + return strict_mode; +} + +static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode) +{ + bool *cur_mode; + int res = 0; + + vrf_map_lock(vmap); + + cur_mode = &vmap->strict_mode; + if (*cur_mode == new_mode) + goto unlock; + + if (*cur_mode) { + /* disable strict mode */ + *cur_mode = false; + } else { + if (vmap->shared_tables) { + /* we cannot allow strict_mode because there are some + * vrfs that share one or more tables. + */ + res = -EBUSY; + goto unlock; + } + + /* no tables are shared among vrfs, so we can go back + * to 1:1 association between a vrf with its table. + */ + *cur_mode = true; + } + +unlock: + vrf_map_unlock(vmap); + + return res; +} + +static int vrf_shared_table_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = (struct net *)table->extra1; + struct vrf_map *vmap = netns_vrf_map(net); + int proc_strict_mode = 0; + struct ctl_table tmp = { + .procname = table->procname, + .data = &proc_strict_mode, + .maxlen = sizeof(int), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + int ret; + + if (!write) + proc_strict_mode = vrf_strict_mode(vmap); + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode); + + return ret; +} + +static const struct ctl_table vrf_table[] = { + { + .procname = "strict_mode", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = vrf_shared_table_handler, + /* set by the vrf_netns_init */ + .extra1 = NULL, + }, +}; + +static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) +{ + struct ctl_table *table; + + table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + /* init the extra1 parameter with the reference to current netns */ + table[0].extra1 = net; + + nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table, + ARRAY_SIZE(vrf_table)); + if (!nn_vrf->ctl_hdr) { + kfree(table); + return -ENOMEM; + } + + return 0; +} + +static void vrf_netns_exit_sysctl(struct net *net) +{ + struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); + const struct ctl_table *table; + + table = nn_vrf->ctl_hdr->ctl_table_arg; + unregister_net_sysctl_table(nn_vrf->ctl_hdr); + kfree(table); +} +#else +static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) +{ + return 0; +} + +static void vrf_netns_exit_sysctl(struct net *net) +{ +} +#endif + /* Initialize per network namespace state */ static int __net_init vrf_netns_init(struct net *net) { - bool *add_fib_rules = net_generic(net, vrf_net_id); + struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); - *add_fib_rules = true; + nn_vrf->add_fib_rules = true; + vrf_map_init(&nn_vrf->vmap); - return 0; + return vrf_netns_init_sysctl(net, nn_vrf); +} + +static void __net_exit vrf_netns_exit(struct net *net) +{ + vrf_netns_exit_sysctl(net); } static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, + .exit = vrf_netns_exit, .id = &vrf_net_id, - .size = sizeof(bool), + .size = sizeof(struct netns_vrf), }; static int __init vrf_init_module(void) @@ -1446,14 +1981,24 @@ static int __init vrf_init_module(void) if (rc < 0) goto error; + rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, + vrf_ifindex_lookup_by_table_id); + if (rc < 0) + goto unreg_pernet; + rc = rtnl_link_register(&vrf_link_ops); - if (rc < 0) { - unregister_pernet_subsys(&vrf_net_ops); - goto error; - } + if (rc < 0) + goto table_lookup_unreg; return 0; +table_lookup_unreg: + l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, + vrf_ifindex_lookup_by_table_id); + +unreg_pernet: + unregister_pernet_subsys(&vrf_net_ops); + error: unregister_netdevice_notifier(&vrf_notifier_block); return rc; |
