diff options
Diffstat (limited to 'net/ipv4/ip_gre.c')
| -rw-r--r-- | net/ipv4/ip_gre.c | 1349 |
1 files changed, 1186 insertions, 163 deletions
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1f6eab66f7ce..761a53c6a89a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: GRE over IP protocol decoder. * * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -17,14 +12,14 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> -#include <linux/mroute.h> +#include <linux/if_vlan.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> @@ -33,6 +28,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> +#include <net/flow.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -47,12 +43,8 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/gre.h> - -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> -#endif +#include <net/dst_metadata.h> +#include <net/erspan.h> /* Problems & solutions @@ -116,10 +108,16 @@ module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; +static const struct header_ops ipgre_header_ops; + static int ipgre_tunnel_init(struct net_device *dev); +static void erspan_build_header(struct sk_buff *skb, + u32 id, u32 index, + bool truncate, bool is_ipv4); -static int ipgre_net_id __read_mostly; -static int gre_tap_net_id __read_mostly; +static unsigned int ipgre_net_id __read_mostly; +static unsigned int gre_tap_net_id __read_mostly; +static unsigned int erspan_net_id __read_mostly; static int ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) @@ -145,17 +143,32 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else if (tpi->proto == htons(ETH_P_ERSPAN) || + tpi->proto == htons(ETH_P_ERSPAN2)) + itn = net_generic(net, erspan_net_id); + else + itn = net_generic(net, ipgre_net_id); + + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->daddr, iph->saddr, tpi->key); + + if (!t) + return -ENOENT; + switch (type) { default: case ICMP_PARAMETERPROB: - return PACKET_RCVD; + return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return PACKET_RCVD; + return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -164,63 +177,297 @@ static int ipgre_err(struct sk_buff *skb, u32 info, break; } break; + case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return PACKET_RCVD; + return 0; break; case ICMP_REDIRECT: break; } - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else - itn = net_generic(net, ipgre_net_id); +#if IS_ENABLED(CONFIG_IPV6) + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; - iph = (const struct iphdr *)skb->data; - t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, - iph->daddr, iph->saddr, tpi->key); + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ - if (t == NULL) - return PACKET_REJECT; + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } +#endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return PACKET_RCVD; + return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return PACKET_RCVD; + return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; - return PACKET_RCVD; + + return 0; } -static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) +static void gre_err(struct sk_buff *skb, u32 info) +{ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + * + * Moreover, Cisco "wise men" put GRE key to the third word + * in GRE header. It makes impossible maintaining even soft + * state for keyed + * GRE tunnels with enabled checksum. Tell them "thank you". + * + * Well, I wonder, rfc1812 was written by Cisco employee, + * what the hell these idiots break standards established + * by themselves??? + */ + + const struct iphdr *iph = (struct iphdr *)skb->data; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct tnl_ptk_info tpi; + + if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP), + iph->ihl * 4) < 0) + return; + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + skb->dev->ifindex, IPPROTO_GRE); + return; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, + IPPROTO_GRE); + return; + } + + ipgre_err(skb, info, &tpi); +} + +static bool is_erspan_type1(int gre_hdr_len) +{ + /* Both ERSPAN type I (version 0) and type II (version 1) use + * protocol 0x88BE, but the type I has only 4-byte GRE header, + * while type II has 8-byte. + */ + return gre_hdr_len == 4; +} + +static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, + int gre_hdr_len) { struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; + struct erspan_base_hdr *ershdr; + IP_TUNNEL_DECLARE_FLAGS(flags); struct ip_tunnel_net *itn; - const struct iphdr *iph; struct ip_tunnel *tunnel; + const struct iphdr *iph; + struct erspan_md2 *md2; + int ver; + int len; - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else - itn = net_generic(net, ipgre_net_id); + ip_tunnel_flags_copy(flags, tpi->flags); + + itn = net_generic(net, erspan_net_id); + iph = ip_hdr(skb); + if (is_erspan_type1(gre_hdr_len)) { + ver = 0; + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, + iph->saddr, iph->daddr, 0); + } else { + if (unlikely(!pskb_may_pull(skb, + gre_hdr_len + sizeof(*ershdr)))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; + iph = ip_hdr(skb); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, + iph->saddr, iph->daddr, tpi->key); + } + + if (tunnel) { + if (is_erspan_type1(gre_hdr_len)) + len = gre_hdr_len; + else + len = gre_hdr_len + erspan_hdr_len(ver); + + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + if (__iptunnel_pull_header(skb, + len, + htons(ETH_P_TEB), + false, false) < 0) + goto drop; + + if (tunnel->collect_md) { + struct erspan_metadata *pkt_md, *md; + struct ip_tunnel_info *info; + unsigned char *gh; + __be64 tun_id; + + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ip_tun_rx_dst(skb, flags, + tun_id, sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + /* skb can be uncloned in __iptunnel_pull_header, so + * old pkt_md is no longer valid and we need to reset + * it + */ + gh = skb_network_header(skb) + + skb_network_header_len(skb); + pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + + sizeof(*ershdr)); + md = ip_tunnel_info_opts(&tun_dst->u.tun_info); + md->version = ver; + md2 = &md->u.md2; + memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : + ERSPAN_V2_MDSIZE); + + info = &tun_dst->u.tun_info; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); + info->options_len = sizeof(*md); + } + + skb_reset_mac_header(skb); + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + return PACKET_RCVD; + } + return PACKET_REJECT; + +drop: + kfree_skb(skb); + return PACKET_RCVD; +} + +static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) +{ + struct metadata_dst *tun_dst = NULL; + const struct iphdr *iph; + struct ip_tunnel *tunnel; iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->saddr, iph->daddr, tpi->key); if (tunnel) { - ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); + const struct iphdr *tnl_params; + + if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, + raw_proto, false) < 0) + goto drop; + + /* Special case for ipgre_header_parse(), which expects the + * mac_header to point to the outer IP header. + */ + if (tunnel->dev->header_ops == &ipgre_header_ops) + skb_pop_mac_header(skb); + else + skb_reset_mac_header(skb); + + tnl_params = &tunnel->parms.iph; + if (tunnel->collect_md || tnl_params->daddr == 0) { + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + __be64 tun_id; + + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + ip_tunnel_flags_and(flags, tpi->flags, flags); + + tun_id = key32_to_tunnel_id(tpi->key); + tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + } + + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } - return PACKET_REJECT; + return PACKET_NEXT; + +drop: + kfree_skb(skb); + return PACKET_RCVD; +} + +static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len) +{ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn; + int res; + + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else + itn = net_generic(net, ipgre_net_id); + + res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); + if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) { + /* ipgre tunnels in collect metadata mode should receive + * also ETH_P_TEB traffic. + */ + itn = net_generic(net, ipgre_net_id); + res = __ipgre_rcv(skb, tpi, itn, hdr_len, true); + } + return res; +} + +static int gre_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + bool csum_err = false; + int hdr_len; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { + /* Looped back packet, drop it! */ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + } +#endif + + hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0); + if (hdr_len < 0) + goto drop; + + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { + if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) + return 0; + goto out; + } + + if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) + return 0; + +out: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: + kfree_skb(skb); + return 0; } static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, @@ -228,43 +475,209 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct tnl_ptk_info tpi; + IP_TUNNEL_DECLARE_FLAGS(flags); - tpi.flags = tunnel->parms.o_flags; - tpi.proto = proto; - tpi.key = tunnel->parms.o_key; - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - tpi.seq = htonl(tunnel->o_seqno); + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->hlen); + gre_build_header(skb, tunnel->tun_hlen, + flags, proto, tunnel->parms.o_key, + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } +static int gre_handle_offloads(struct sk_buff *skb, bool csum) +{ + return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); +} + +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + int tunnel_hlen; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + tunnel_hlen = gre_calc_hlen(key->tun_flags); + + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; + + /* Push Tunnel header. */ + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) + goto err_free_skb; + + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags); + + gre_build_header(skb, tunnel_hlen, flags, proto, + tunnel_id_to_key32(tun_info->key.tun_id), + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); + + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); + + return; + +err_free_skb: + kfree_skb(skb); + DEV_STATS_INC(dev, tx_dropped); +} + +static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + bool truncate = false; + __be16 proto; + int tunnel_hlen; + int version; + int nhoff; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) + goto err_free_skb; + if (tun_info->options_len < sizeof(*md)) + goto err_free_skb; + md = ip_tunnel_info_opts(tun_info); + + /* ERSPAN has fixed 8 byte GRE header */ + version = md->version; + tunnel_hlen = 8 + erspan_hdr_len(version); + + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; + + if (gre_handle_offloads(skb, false)) + goto err_free_skb; + + if (skb->len > dev->mtu + dev->hard_header_len) { + if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) + goto err_free_skb; + truncate = true; + } + + nhoff = skb_network_offset(skb); + if (skb->protocol == htons(ETH_P_IP) && + (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) + truncate = true; + + if (skb->protocol == htons(ETH_P_IPV6)) { + int thoff; + + if (skb_transport_header_was_set(skb)) + thoff = skb_transport_offset(skb); + else + thoff = nhoff + sizeof(struct ipv6hdr); + if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) + truncate = true; + } + + if (version == 1) { + erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), + ntohl(md->u.index), truncate, true); + proto = htons(ETH_P_ERSPAN); + } else if (version == 2) { + erspan_build_header_v2(skb, + ntohl(tunnel_id_to_key32(key->tun_id)), + md->u.md2.dir, + get_hwid(&md->u.md2), + truncate, true); + proto = htons(ETH_P_ERSPAN2); + } else { + goto err_free_skb; + } + + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&tunnel->o_seqno))); + + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); + + return; + +err_free_skb: + kfree_skb(skb); + DEV_STATS_INC(dev, tx_dropped); +} + +static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_key *key; + struct rtable *rt; + struct flowi4 fl4; + + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + + key = &info->key; + ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, + tunnel_id_to_key32(key->tun_id), + key->tos & ~INET_ECN_MASK, dev_net(dev), 0, + skb->mark, skb_get_hash(skb), key->flow_flags); + rt = ip_route_output_key(dev_net(dev), &fl4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + info->key.u.ipv4.src = fl4.saddr; + return 0; +} + static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; - skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); - if (IS_ERR(skb)) - goto out; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev, skb->protocol); + return NETDEV_TX_OK; + } if (dev->header_ops) { - /* Need space for new headers */ - if (skb_cow_head(skb, dev->needed_headroom - - (tunnel->hlen + sizeof(struct iphdr)))) + int pull_len = tunnel->hlen + sizeof(struct iphdr); + + if (skb_cow_head(skb, 0)) + goto free_skb; + + if (!pskb_may_pull(skb, pull_len)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; - /* Pull skb since ip_tunnel_xmit() needs skb->data pointing - * to gre header. - */ - skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + /* ip_tunnel_xmit() needs skb->data pointing to gre header. */ + skb_pull(skb, pull_len); + skb_reset_mac_header(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start(skb) < skb->data) + goto free_skb; } else { if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; @@ -272,14 +685,71 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, tnl_params = &tunnel->parms.iph; } + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) + goto free_skb; + __gre_xmit(skb, dev, tnl_params, skb->protocol); + return NETDEV_TX_OK; +free_skb: + kfree_skb(skb); + DEV_STATS_INC(dev, tx_dropped); + return NETDEV_TX_OK; +} + +static netdev_tx_t erspan_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + bool truncate = false; + __be16 proto; + + if (!pskb_inet_may_pull(skb)) + goto free_skb; + + if (tunnel->collect_md) { + erspan_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + + if (gre_handle_offloads(skb, false)) + goto free_skb; + + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; + + if (skb->len > dev->mtu + dev->hard_header_len) { + if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) + goto free_skb; + truncate = true; + } + + /* Push ERSPAN header */ + if (tunnel->erspan_ver == 0) { + proto = htons(ETH_P_ERSPAN); + __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags); + } else if (tunnel->erspan_ver == 1) { + erspan_build_header(skb, ntohl(tunnel->parms.o_key), + tunnel->index, + truncate, true); + proto = htons(ETH_P_ERSPAN); + } else if (tunnel->erspan_ver == 2) { + erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), + tunnel->dir, tunnel->hwid, + truncate, true); + proto = htons(ETH_P_ERSPAN2); + } else { + goto free_skb; + } + + __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags); + __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); -out: - dev->stats.tx_dropped++; + kfree_skb(skb); + DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } @@ -288,50 +758,102 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); - skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); - if (IS_ERR(skb)) - goto out; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); + return NETDEV_TX_OK; + } + + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) + goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); - return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); -out: - dev->stats.tx_dropped++; + kfree_skb(skb); + DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } -static int ipgre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static void ipgre_link_update(struct net_device *dev, bool set_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int len; + + len = tunnel->tun_hlen; + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + len = tunnel->tun_hlen - len; + tunnel->hlen = tunnel->hlen + len; + + if (dev->header_ops) + dev->hard_header_len += len; + else + dev->needed_headroom += len; + + if (set_mtu) + WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68)); + + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) || + (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE)) { + dev->features &= ~NETIF_F_GSO_SOFTWARE; + dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; + } else { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } +} + +static int ipgre_tunnel_ctl(struct net_device *dev, + struct ip_tunnel_parm_kern *p, + int cmd) { - int err = 0; - struct ip_tunnel_parm p; + __be16 i_flags, o_flags; + int err; + + if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || + !ip_tunnel_flags_is_be16_compat(p->o_flags)) + return -EOVERFLOW; + + i_flags = ip_tunnel_flags_to_be16(p->i_flags); + o_flags = ip_tunnel_flags_to_be16(p->o_flags); - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || - ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || + ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p.i_flags = gre_flags_to_tnl_flags(p.i_flags); - p.o_flags = gre_flags_to_tnl_flags(p.o_flags); - err = ip_tunnel_ioctl(dev, &p, cmd); + gre_flags_to_tnl_flags(p->i_flags, i_flags); + gre_flags_to_tnl_flags(p->o_flags, o_flags); + + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; - p.i_flags = tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = tnl_flags_to_gre_flags(p.o_flags); + if (cmd == SIOCCHGTUNNEL) { + struct ip_tunnel *t = netdev_priv(dev); + + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, true); + } + + i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + ip_tunnel_flags_from_be16(p->i_flags, i_flags); + o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); + ip_tunnel_flags_from_be16(p->o_flags, o_flags); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; return 0; } @@ -370,9 +892,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, struct iphdr *iph; struct gre_base_hdr *greh; - iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); + iph = skb_push(skb, t->hlen + sizeof(*iph)); greh = (struct gre_base_hdr *)(iph+1); - greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags); + greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); @@ -383,7 +905,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) - return t->hlen; + return t->hlen + sizeof(*iph); return -(t->hlen + sizeof(*iph)); } @@ -406,20 +928,23 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi4 fl4; + struct flowi4 fl4 = { + .flowi4_oif = t->parms.link, + .flowi4_dscp = ip4h_dscp(&t->parms.iph), + .flowi4_scope = RT_SCOPE_UNIVERSE, + .flowi4_proto = IPPROTO_GRE, + .saddr = t->parms.iph.saddr, + .daddr = t->parms.iph.daddr, + .fl4_gre_key = t->parms.o_key, + }; struct rtable *rt; - rt = ip_route_output_gre(dev_net(dev), &fl4, - t->parms.iph.daddr, - t->parms.iph.saddr, - t->parms.o_key, - RT_TOS(t->parms.iph.tos), - t->parms.link); + rt = ip_route_output_key(t->net, &fl4); if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); - if (__in_dev_get_rtnl(dev) == NULL) + if (!__in_dev_get_rtnl(dev)) return -EADDRNOTAVAIL; t->mlink = dev->ifindex; ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); @@ -433,7 +958,7 @@ static int ipgre_close(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; - in_dev = inetdev_by_index(dev_net(dev), t->mlink); + in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } @@ -449,9 +974,11 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ipgre_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, + .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ @@ -462,6 +989,7 @@ static const struct net_device_ops ipgre_netdev_ops = { static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; + dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } @@ -470,24 +998,28 @@ static void __gre_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel; tunnel = netdev_priv(dev); - tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; - dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); - dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES; + dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; - if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported. */ - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } + /* TCP offload with GRE SEQ is not supported, nor can we support 2 + * levels of outer headers requiring an update. + */ + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags)) + return; + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE) + return; + + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ipgre_tunnel_init(struct net_device *dev) @@ -497,33 +1029,36 @@ static int ipgre_tunnel_init(struct net_device *dev) __gre_tunnel_init(dev); - memcpy(dev->dev_addr, &iph->saddr, 4); + __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_IPGRE; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->addr_len = 4; - if (iph->daddr) { + if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } #endif - } else + } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; + } return ip_tunnel_init(dev); } -static struct gre_cisco_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .priority = 0, +static const struct gre_protocol ipgre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, }; static int __net_init ipgre_init_net(struct net *net) @@ -531,20 +1066,21 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_rtnl = ipgre_exit_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; -static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { __be16 flags; @@ -559,10 +1095,16 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; + if (data[IFLA_GRE_COLLECT_METADATA] && + data[IFLA_GRE_ENCAP_TYPE] && + nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) + return -EINVAL; + return 0; } -static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { __be32 daddr; @@ -583,27 +1125,74 @@ static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) } out: - return ipgre_tunnel_validate(tb, data); + return ipgre_tunnel_validate(tb, data, extack); +} + +static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret; + + if (!data) + return 0; + + ret = ipgre_tap_validate(tb, data, extack); + if (ret) + return ret; + + if (data[IFLA_GRE_ERSPAN_VER] && + nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) + return 0; + + /* ERSPAN type II/III should only have GRE sequence and key flag */ + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. + */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + return 0; } -static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms) +static int ipgre_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm_kern *parms, + __u32 *fwmark) { + struct ip_tunnel *t = netdev_priv(dev); + memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) - return; + return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -612,10 +1201,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); + parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); + parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); @@ -623,13 +1212,114 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); - if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { + if (t->ignore_df) + return -EINVAL; parms->iph.frag_off = htons(IP_DF); + } + + if (data[IFLA_GRE_COLLECT_METADATA]) { + t->collect_md = true; + if (dev->type == ARPHRD_IPGRE) + dev->type = ARPHRD_NONE; + } + + if (data[IFLA_GRE_IGNORE_DF]) { + if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) + && (parms->iph.frag_off & htons(IP_DF))) + return -EINVAL; + t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); + } + + if (data[IFLA_GRE_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + + return 0; +} + +static int erspan_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm_kern *parms, + __u32 *fwmark) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); + if (err) + return err; + if (!data) + return 0; + + if (data[IFLA_GRE_ERSPAN_VER]) { + t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + + if (t->erspan_ver > 2) + return -EINVAL; + } + + if (t->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + } else if (t->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + if (data[IFLA_GRE_ERSPAN_HWID]) { + t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + + return 0; +} + +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipgre_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_GRE_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); + } + + if (data[IFLA_GRE_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); + } + + if (data[IFLA_GRE_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); + } + + if (data[IFLA_GRE_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); + } + + return ret; } static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -641,32 +1331,166 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, + .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, +}; + +static int erspan_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + if (tunnel->erspan_ver == 0) + tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */ + else + tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */ + + tunnel->parms.iph.protocol = IPPROTO_GRE; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + erspan_hdr_len(tunnel->erspan_ver); + + dev->features |= GRE_FEATURES; + dev->hw_features |= GRE_FEATURES; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); + + return ip_tunnel_init(dev); +} + +static const struct net_device_ops erspan_netdev_ops = { + .ndo_init = erspan_tunnel_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = erspan_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = dev_get_tstats64, + .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); - dev->netdev_ops = &gre_tap_netdev_ops; + dev->max_mtu = 0; + dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int +ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) +{ + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } + + return 0; +} + +static int ipgre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) +{ + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; + struct ip_tunnel_parm_kern p; + __u32 fwmark = 0; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); + if (err < 0) + return err; + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); +} + +static int erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; + struct ip_tunnel_parm_kern p; + __u32 fwmark = 0; + int err; - ipgre_netlink_parms(data, tb, &p); - return ip_tunnel_newlink(dev, tb, &p); + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err) + return err; + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) + struct nlattr *data[], + struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; + __u32 fwmark = t->fwmark; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); + if (err < 0) + return err; + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); + + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; +} + +static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; + __u32 fwmark = t->fwmark; + int err; - ipgre_netlink_parms(data, tb, &p); - return ip_tunnel_changelink(dev, tb, &p); + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err < 0) + return err; + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); + + return 0; } static size_t ipgre_get_size(const struct net_device *dev) @@ -692,43 +1516,142 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_PMTUDISC */ nla_total_size(1) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + + /* IFLA_GRE_IGNORE_DF */ + nla_total_size(1) + + /* IFLA_GRE_FWMARK */ + nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + + /* IFLA_GRE_ERSPAN_VER */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_DIR */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_HWID */ + nla_total_size(2) + 0; } static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; + struct ip_tunnel_parm_kern *p = &t->parms; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, + gre_tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, + gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || - nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || - nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || + nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || + nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || nla_put_u8(skb, IFLA_GRE_PMTUDISC, - !!(p->iph.frag_off & htons(IP_DF)))) + !!(p->iph.frag_off & htons(IP_DF))) || + nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark)) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, + t->encap.type) || + nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, + t->encap.sport) || + nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, + t->encap.dport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, + t->encap.flags)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) + goto nla_put_failure; + + if (t->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + return 0; nla_put_failure: return -EMSGSIZE; } +static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + if (t->erspan_ver <= 2) { + if (t->erspan_ver != 0 && !t->collect_md) + __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } + } + + return ipgre_fill_info(skb, dev); + +nla_put_failure: + return -EMSGSIZE; +} + +static void erspan_setup(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + ether_setup(dev); + dev->max_mtu = 0; + dev->netdev_ops = &erspan_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + ip_tunnel_setup(dev, erspan_net_id); + t->erspan_ver = 1; +} + static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, - [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, - [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, + [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -743,6 +1666,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = { .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, + .get_link_net = ip_tunnel_get_link_net, }; static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { @@ -757,26 +1681,108 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, + .get_link_net = ip_tunnel_get_link_net, +}; + +static struct rtnl_link_ops erspan_link_ops __read_mostly = { + .kind = "erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = erspan_setup, + .validate = erspan_validate, + .newlink = erspan_newlink, + .changelink = erspan_changelink, + .dellink = ip_tunnel_dellink, + .get_size = ipgre_get_size, + .fill_info = erspan_fill_info, + .get_link_net = ip_tunnel_get_link_net, }; +struct net_device *gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type) +{ + struct rtnl_newlink_params params = { .src_net = net }; + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + LIST_HEAD(list_kill); + struct ip_tunnel *t; + int err; + + memset(&tb, 0, sizeof(tb)); + params.tb = tb; + + dev = rtnl_create_link(net, name, name_assign_type, + &ipgre_tap_ops, tb, NULL); + if (IS_ERR(dev)) + return dev; + + /* Configure flow based GRE device. */ + t = netdev_priv(dev); + t->collect_md = true; + + err = ipgre_newlink(dev, ¶ms, NULL); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + + /* openvswitch users expect packet sizes to be unrestricted, + * so set the largest MTU we can. + */ + err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); + if (err) + goto out; + + err = rtnl_configure_link(dev, NULL, 0, NULL); + if (err < 0) + goto out; + + return dev; +out: + ip_tunnel_dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(gretap_fb_dev_create); + static int __net_init ipgre_tap_init_net(struct net *net) { - return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); + return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_rtnl = ipgre_tap_exit_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; +static int __net_init erspan_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, erspan_net_id, + &erspan_link_ops, "erspan0"); +} + +static void __net_exit erspan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) +{ + ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill); +} + +static struct pernet_operations erspan_net_ops = { + .init = erspan_init_net, + .exit_rtnl = erspan_exit_rtnl, + .id = &erspan_net_id, + .size = sizeof(struct ip_tunnel_net), +}; + static int __init ipgre_init(void) { int err; @@ -789,9 +1795,13 @@ static int __init ipgre_init(void) err = register_pernet_device(&ipgre_tap_net_ops); if (err < 0) - goto pnet_tap_faied; + goto pnet_tap_failed; + + err = register_pernet_device(&erspan_net_ops); + if (err < 0) + goto pnet_erspan_failed; - err = gre_cisco_register(&ipgre_protocol); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; @@ -805,15 +1815,23 @@ static int __init ipgre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&erspan_link_ops); + if (err < 0) + goto erspan_link_failed; + return 0; +erspan_link_failed: + rtnl_link_unregister(&ipgre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - gre_cisco_unregister(&ipgre_protocol); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: + unregister_pernet_device(&erspan_net_ops); +pnet_erspan_failed: unregister_pernet_device(&ipgre_tap_net_ops); -pnet_tap_faied: +pnet_tap_failed: unregister_pernet_device(&ipgre_net_ops); return err; } @@ -822,15 +1840,20 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - gre_cisco_unregister(&ipgre_protocol); + rtnl_link_unregister(&erspan_link_ops); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); + unregister_pernet_device(&erspan_net_ops); } module_init(ipgre_init); module_exit(ipgre_fini); +MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); +MODULE_ALIAS_RTNL_LINK("erspan"); MODULE_ALIAS_NETDEV("gre0"); MODULE_ALIAS_NETDEV("gretap0"); +MODULE_ALIAS_NETDEV("erspan0"); |
