diff options
Diffstat (limited to 'net/ipv6/sit.c')
| -rw-r--r-- | net/ipv6/sit.c | 1235 |
1 files changed, 734 insertions, 501 deletions
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a3437a4cd07e..cf37ad9686e6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) * Linux INET6 implementation @@ -6,11 +7,6 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Changes: * Roger Venning <r.venning@telstra.com>: 6to4 support * Nate Thompson <nate@thebog.net>: 6to4 support @@ -31,7 +27,7 @@ #include <linux/if_arp.h> #include <linux/icmp.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> @@ -55,6 +51,8 @@ #include <net/dsfield.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> +#include <net/inet_dscp.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -62,7 +60,7 @@ For comments look at net/ipv4/ip_gre.c --ANK */ -#define HASH_SIZE 16 +#define IP6_SIT_HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) static bool log_ecn_error = true; @@ -76,55 +74,68 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst); static struct rtnl_link_ops sit_link_ops __read_mostly; -static int sit_net_id __read_mostly; +static unsigned int sit_net_id __read_mostly; struct sit_net { - struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_wc[1]; struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; +static inline struct sit_net *dev_to_sit_net(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + return net_generic(t->net, sit_net_id); +} + /* * Must be invoked with rcu_read_lock */ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, - struct net_device *dev, __be32 remote, __be32 local) + struct net_device *dev, + __be32 remote, __be32 local, + int sifindex) { unsigned int h0 = HASH(remote); unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); + int ifindex = dev ? dev->ifindex : 0; for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } t = rcu_dereference(sitn->tunnels_wc[0]); - if ((t != NULL) && (t->dev->flags & IFF_UP)) + if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } -static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, - struct ip_tunnel_parm *parms) +static struct ip_tunnel __rcu ** +__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -176,7 +187,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel *t = netdev_priv(dev); - if (t->dev == sitn->fb_tunnel_dev) { + if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) { ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); t->ip6rd.relay_prefix = 0; t->ip6rd.prefixlen = 16; @@ -191,26 +202,22 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct sit_net *sitn = net_generic(net, sit_net_id); + struct sit_net *sitn = net_generic(t->net, sit_net_id); int err; - err = ipip6_tunnel_init(dev); - if (err < 0) - goto out; - ipip6_tunnel_clone_6rd(dev, sitn); + __dev_addr_set(dev, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); - if ((__force u16)t->parms.i_flags & SIT_ISATAP) + if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags)) dev->priv_flags |= IFF_ISATAP; + dev->rtnl_link_ops = &sit_link_ops; + err = register_netdevice(dev); if (err < 0) goto out; - strcpy(t->parms.name, dev->name); - dev->rtnl_link_ops = &sit_link_ops; - - dev_hold(dev); + ipip6_tunnel_clone_6rd(dev, sitn); ipip6_tunnel_link(sitn, t); return 0; @@ -220,7 +227,8 @@ out: } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) + struct ip_tunnel_parm_kern *parms, + int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -245,27 +253,34 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, if (!create) goto failed; - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else + if (parms->name[0]) { + if (!dev_valid_name(parms->name)) + goto failed; + strscpy(name, parms->name, IFNAMSIZ); + } else { strcpy(name, "sit%d"); - - dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); - if (dev == NULL) + } + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ipip6_tunnel_setup); + if (!dev) return NULL; dev_net_set(dev, net); nt = netdev_priv(dev); + nt->net = net; nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; + if (!parms->name[0]) + strcpy(parms->name, dev->name); + return nt; failed_free: - ipip6_dev_free(dev); + free_netdev(dev); failed: return NULL; } @@ -287,14 +302,17 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) } -static int ipip6_tunnel_get_prl(struct ip_tunnel *t, - struct ip_tunnel_prl __user *a) +static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; unsigned int cmax, c = 0, ca, len; int ret = 0; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + if (copy_from_user(&kprl, a, sizeof(kprl))) return -EFAULT; cmax = kprl.datalen / sizeof(kprl); @@ -305,12 +323,10 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t, * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? - kcalloc(cmax, sizeof(*kp), GFP_KERNEL) : + kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; - rcu_read_lock(); - - ca = t->prl_count < cmax ? t->prl_count : cmax; + ca = min(t->prl_count, cmax); if (!kp) { /* We don't try hard to allocate much memory for @@ -318,14 +334,15 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t, * For root users, retry allocating enough memory for * the answer. */ - kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC); + kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | + __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; } } - c = 0; + rcu_read_lock(); for_each_prl_rcu(t->prl) { if (c >= cmax) break; @@ -337,7 +354,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t, if (kprl.addr != htonl(INADDR_ANY)) break; } -out: + rcu_read_unlock(); len = sizeof(*kp) * c; @@ -346,7 +363,7 @@ out: ret = -EFAULT; kfree(kp); - +out: return ret; } @@ -437,6 +454,35 @@ out: return err; } +static int ipip6_tunnel_prl_ctl(struct net_device *dev, + struct ip_tunnel_prl __user *data, int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_prl prl; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + + if (copy_from_user(&prl, data, sizeof(prl))) + return -EFAULT; + + switch (cmd) { + case SIOCDELPRL: + err = ipip6_tunnel_del_prl(t, &prl); + break; + case SIOCADDPRL: + case SIOCCHGPRL: + err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); + break; + } + dst_cache_reset(&t->dst_cache); + netdev_state_change(dev); + return err; +} + static int isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { @@ -475,21 +521,18 @@ static void ipip6_tunnel_uninit(struct net_device *dev) ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } - dev_put(dev); + dst_cache_reset(&tunnel->dst_cache); + netdev_put(dev, &tunnel->dev_tracker); } - static int ipip6_err(struct sk_buff *skb, u32 info) { - -/* All the routers (except for Linux) return only - 8 bytes of packet payload. It means, that precise relaying of - ICMP in the real Internet is absolutely infeasible. - */ const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + unsigned int data_len = 0; struct ip_tunnel *t; + int sifindex; int err; switch (type) { @@ -500,7 +543,6 @@ static int ipip6_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; default: @@ -514,6 +556,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: break; @@ -521,30 +564,33 @@ static int ipip6_err(struct sk_buff *skb, u32 info) err = -ENOENT; - t = ipip6_tunnel_lookup(dev_net(skb->dev), - skb->dev, - iph->daddr, - iph->saddr); - if (t == NULL) + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; + t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->daddr, iph->saddr, sifindex); + if (!t) goto out; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->dev->ifindex, 0, IPPROTO_IPV6, 0); + t->parms.link, iph->protocol); err = 0; goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, - IPPROTO_IPV6, 0); + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, + iph->protocol); err = 0; goto out; } + err = 0; + if (__in6_dev_get(skb->dev) && + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) + goto out; + if (t->parms.iph.daddr == 0) goto out; - err = 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; @@ -566,44 +612,104 @@ static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, return false; } +/* Checks if an address matches an address on the tunnel interface. + * Used to detect the NAT of proto 41 packets and let them pass spoofing test. + * Long story: + * This function is called after we considered the packet as spoofed + * in is_spoofed_6rd. + * We may have a router that is doing NAT for proto 41 packets + * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb + * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd + * function will return true, dropping the packet. + * But, we can still check if is spoofed against the IP + * addresses associated with the interface. + */ +static bool only_dnatted(const struct ip_tunnel *tunnel, + const struct in6_addr *v6dst) +{ + int prefix_len; + +#ifdef CONFIG_IPV6_SIT_6RD + prefix_len = tunnel->ip6rd.prefixlen + 32 + - tunnel->ip6rd.relay_prefixlen; +#else + prefix_len = 48; +#endif + return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev); +} + +/* Returns true if a packet is spoofed */ +static bool packet_is_spoofed(struct sk_buff *skb, + const struct iphdr *iph, + struct ip_tunnel *tunnel) +{ + const struct ipv6hdr *ipv6h; + + if (tunnel->dev->priv_flags & IFF_ISATAP) { + if (!isatap_chksrc(skb, iph, tunnel)) + return true; + + return false; + } + + if (tunnel->dev->flags & IFF_POINTOPOINT) + return false; + + ipv6h = ipv6_hdr(skb); + + if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) { + net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; + } + + if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr))) + return false; + + if (only_dnatted(tunnel, &ipv6h->daddr)) + return false; + + net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; +} + static int ipip6_rcv(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; + int sifindex; int err; + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, - iph->saddr, iph->daddr); - if (tunnel != NULL) { - struct pcpu_tstats *tstats; - + iph->saddr, iph->daddr, sifindex); + if (tunnel) { if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && tunnel->parms.iph.protocol != 0) goto out; - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; - skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; + skb->dev = tunnel->dev; - if (tunnel->dev->priv_flags & IFF_ISATAP) { - if (!isatap_chksrc(skb, iph, tunnel)) { - tunnel->dev->stats.rx_errors++; - goto out; - } - } else if (!(tunnel->dev->flags&IFF_POINTOPOINT)) { - if (is_spoofed_6rd(tunnel, iph->saddr, - &ipv6_hdr(skb)->saddr) || - is_spoofed_6rd(tunnel, iph->daddr, - &ipv6_hdr(skb)->daddr)) { - tunnel->dev->stats.rx_errors++; - goto out; - } + if (packet_is_spoofed(skb, iph, tunnel)) { + DEV_STATS_INC(tunnel->dev, rx_errors); + goto out; } - __skb_tunnel_rx(skb, tunnel->dev); + if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6), + !net_eq(tunnel->net, dev_net(tunnel->dev)))) + goto out; + + /* skb can be uncloned in iptunnel_pull_header, so + * old iph is no longer valid + */ + iph = (const struct iphdr *)skb_mac_header(skb); + skb_reset_mac_header(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { @@ -611,18 +717,14 @@ static int ipip6_rcv(struct sk_buff *skb) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; + DEV_STATS_INC(tunnel->dev, rx_frame_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } } - tstats = this_cpu_ptr(tunnel->dev->tstats); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; + dev_sw_netstats_rx_add(tunnel->dev, skb->len); - if (tunnel->net != dev_net(tunnel->dev)) - skb_scrub_packet(skb); netif_rx(skb); return 0; @@ -635,31 +737,49 @@ out: return 0; } -static const struct tnl_ptk_info tpi = { +static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; -static int ipip_rcv(struct sk_buff *skb) +#if IS_ENABLED(CONFIG_MPLS) +static const struct tnl_ptk_info mplsip_tpi = { + /* no tunnel info required for mplsip. */ + .proto = htons(ETH_P_MPLS_UC), +}; +#endif + +static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; + int sifindex; - if (iptunnel_pull_header(skb, 0, tpi.proto)) - goto drop; + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; iph = ip_hdr(skb); - tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, - iph->saddr, iph->daddr); - if (tunnel != NULL) { - if (tunnel->parms.iph.protocol != IPPROTO_IPIP && + iph->saddr, iph->daddr, sifindex); + if (tunnel) { + const struct tnl_ptk_info *tpi; + + if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); +#if IS_ENABLED(CONFIG_MPLS) + if (ipproto == IPPROTO_MPLS) + tpi = &mplsip_tpi; + else +#endif + tpi = &ipip_tpi; + if (iptunnel_pull_header(skb, 0, tpi->proto, false)) + goto drop; + skb_reset_mac_header(skb); + + return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return 1; @@ -669,6 +789,18 @@ drop: return 0; } +static int ipip_rcv(struct sk_buff *skb) +{ + return sit_tunnel_rcv(skb, IPPROTO_IPIP); +} + +#if IS_ENABLED(CONFIG_MPLS) +static int mplsip_rcv(struct sk_buff *skb) +{ + return sit_tunnel_rcv(skb, IPPROTO_MPLS); +} +#endif + /* * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function * stores the embedded IPv4 address in v4dst and returns true. @@ -686,8 +818,9 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, pbw0 = tunnel->ip6rd.prefixlen >> 5; pbi0 = tunnel->ip6rd.prefixlen & 0x1f; - d = (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> - tunnel->ip6rd.relay_prefixlen; + d = tunnel->ip6rd.relay_prefixlen < 32 ? + (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> + tunnel->ip6rd.relay_prefixlen : 0; pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen; if (pbi1 > 0) @@ -715,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel, return dst; } +static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst, + bool is_isatap) +{ + const struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct neighbour *neigh = NULL; + const struct in6_addr *addr6; + bool found = false; + int addr_type; + + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); + + if (!neigh) { + net_dbg_ratelimited("nexthop == NULL\n"); + return false; + } + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (is_isatap) { + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } else { + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } + + neigh_release(neigh); + + return found; +} + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. @@ -728,110 +904,68 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - unsigned int max_headroom; /* The extra header space needed */ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; - const struct in6_addr *addr6; - int addr_type; u8 ttl; - int err; - - if (skb->protocol != htons(ETH_P_IPV6)) - goto tx_error; + u8 protocol = IPPROTO_IPV6; + int t_hlen = tunnel->hlen + sizeof(struct iphdr); if (tos == 1) tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ - if (dev->priv_flags & IFF_ISATAP) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (neigh == NULL) { - net_dbg_ratelimited("sit: nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if ((addr_type & IPV6_ADDR_UNICAST) && - ipv6_addr_is_isatap(addr6)) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if ((dev->priv_flags & IFF_ISATAP) && + !ipip6_tunnel_dst_find(skb, &dst, true)) + goto tx_error; if (!dst) dst = try_6rd(tunnel, &iph6->daddr); - if (!dst) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (neigh == NULL) { - net_dbg_ratelimited("sit: nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); + if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false)) + goto tx_error; - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); + flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, + tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, + IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, + sock_net_uid(tunnel->net, NULL)); + + rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); + if (!rt) { + rt = ip_route_output_flow(tunnel->net, &fl4, NULL); + if (IS_ERR(rt)) { + DEV_STATS_INC(dev, tx_carrier_errors); + goto tx_error_icmp; } - - if ((addr_type & IPV6_ADDR_COMPATv4) != 0) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } - rt = ip_route_output_ports(tunnel->net, &fl4, NULL, - dst, tiph->saddr, - 0, 0, - IPPROTO_IPV6, RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; - } - if (rt->rt_type != RTN_UNICAST) { + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); + goto tx_error; + } + + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) { + ip_rt_put(rt); goto tx_error; } if (df) { - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - t_hlen; - if (mtu < 68) { - dev->stats.collisions++; + if (mtu < IPV4_MIN_MTU) { + DEV_STATS_INC(dev, collisions); ip_rt_put(rt); goto tx_error; } @@ -841,11 +975,11 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, df = 0; } - if (tunnel->parms.iph.daddr && skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (tunnel->parms.iph.daddr) + skb_dst_update_pmtu_no_confirm(skb, mtu); - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + if (skb->len > mtu && !skb_is_gso(skb)) { + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } @@ -860,21 +994,18 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tunnel->err_count = 0; } - if (tunnel->net != dev_net(dev)) - skb_scrub_packet(skb); - /* * Okay, now see if we can stuff it in the buffer as-is. */ - max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); + max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - dev->stats.tx_dropped++; - dev_kfree_skb(skb); + DEV_STATS_INC(dev, tx_dropped); + kfree_skb(skb); return NETDEV_TX_OK; } if (skb->sk) @@ -888,43 +1019,62 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr, - IPPROTO_IPV6, tos, ttl, df); - iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) { + ip_rt_put(rt); + goto tx_error; + } + + skb_set_inner_ipproto(skb, IPPROTO_IPV6); + + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev)), 0); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - dev->stats.tx_errors++; - dev_kfree_skb(skb); + kfree_skb(skb); + DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } -static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, + struct net_device *dev, u8 ipproto) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) + goto tx_error; + + skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); + ip_tunnel_xmit(skb, dev, tiph, ipproto); + return NETDEV_TX_OK; +tx_error: + kfree_skb(skb); + DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): - ipip_tunnel_xmit(skb, dev); + sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); break; case htons(ETH_P_IPV6): ipip6_tunnel_xmit(skb, dev); break; +#if IS_ENABLED(CONFIG_MPLS) + case htons(ETH_P_MPLS_UC): + sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS); + break; +#endif default: goto tx_err; } @@ -932,20 +1082,21 @@ static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, return NETDEV_TX_OK; tx_err: - dev->stats.tx_errors++; - dev_kfree_skb(skb); + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); return NETDEV_TX_OK; } static void ipip6_tunnel_bind_dev(struct net_device *dev) { + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; + int hlen = LL_MAX_HEADER; const struct iphdr *iph; struct flowi4 fl4; - tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { @@ -954,7 +1105,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) iph->daddr, iph->saddr, 0, 0, IPPROTO_IPV6, - RT_TOS(iph->tos), + iph->tos & INET_DSCP_MASK, tunnel->parms.link); if (!IS_ERR(rt)) { @@ -967,16 +1118,21 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); - if (tdev) { - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; + if (tdev && !netif_is_l3_master(tdev)) { + int mtu; + + mtu = tdev->mtu - t_hlen; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + WRITE_ONCE(dev->mtu, mtu); + hlen = tdev->hard_header_len + tdev->needed_headroom; } - dev->iflink = tunnel->parms.link; + dev->needed_headroom = t_hlen + hlen; } -static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) +static void ipip6_tunnel_update(struct ip_tunnel *t, + struct ip_tunnel_parm_kern *p, + __u32 fwmark) { struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); @@ -985,15 +1141,18 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) synchronize_net(); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; - memcpy(t->dev->dev_addr, &p->iph.saddr, 4); + __dev_addr_set(t->dev, &p->iph.saddr, 4); memcpy(t->dev->broadcast, &p->iph.daddr, 4); ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; - if (t->parms.link != p->link) { + t->parms.iph.frag_off = p->iph.frag_off; + if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; + t->fwmark = fwmark; ipip6_tunnel_bind_dev(t->dev); } + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } @@ -1024,282 +1183,295 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } -#endif static int -ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { - int err = 0; - struct ip_tunnel_parm p; - struct ip_tunnel_prl prl; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct sit_net *sitn = net_generic(net, sit_net_id); -#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; struct ip_tunnel_6rd ip6rd; -#endif - switch (cmd) { - case SIOCGETTUNNEL: -#ifdef CONFIG_IPV6_SIT_6RD - case SIOCGET6RD: -#endif - t = NULL; - if (dev == sitn->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipip6_tunnel_locate(net, &p, 0); - } - if (t == NULL) - t = netdev_priv(dev); + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (!ip_tunnel_parm_from_user(&p, data)) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + } + if (!t) + t = netdev_priv(dev); - err = -EFAULT; - if (cmd == SIOCGETTUNNEL) { - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, - sizeof(p))) - goto done; -#ifdef CONFIG_IPV6_SIT_6RD - } else { - ip6rd.prefix = t->ip6rd.prefix; - ip6rd.relay_prefix = t->ip6rd.relay_prefix; - ip6rd.prefixlen = t->ip6rd.prefixlen; - ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, - sizeof(ip6rd))) - goto done; + ip6rd.prefix = t->ip6rd.prefix; + ip6rd.relay_prefix = t->ip6rd.relay_prefix; + ip6rd.prefixlen = t->ip6rd.prefixlen; + ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; + if (copy_to_user(data, &ip6rd, sizeof(ip6rd))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data, + int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_6rd ip6rd; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&ip6rd, data, sizeof(ip6rd))) + return -EFAULT; + + if (cmd != SIOCDEL6RD) { + err = ipip6_tunnel_update_6rd(t, &ip6rd); + if (err < 0) + return err; + } else + ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev)); + return 0; +} + +#endif /* CONFIG_IPV6_SIT_6RD */ + +static bool ipip6_valid_ip_proto(u8 ipproto) +{ + return ipproto == IPPROTO_IPV6 || + ipproto == IPPROTO_IPIP || +#if IS_ENABLED(CONFIG_MPLS) + ipproto == IPPROTO_MPLS || #endif - } - err = 0; - break; + ipproto == 0; +} - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; - if (p.iph.protocol != IPPROTO_IPV6 && - p.iph.protocol != IPPROTO_IPIP && - p.iph.protocol != 0) - goto done; - if (p.iph.version != 4 || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) - goto done; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { - err = -EINVAL; - break; - } - t = netdev_priv(dev); - } +static int +__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p) +{ + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; - ipip6_tunnel_update(t, &p); - } + if (!ipip6_valid_ip_proto(p->iph.protocol)) + return -EINVAL; + if (p->iph.version != 4 || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) + return -EINVAL; - if (t) { - err = 0; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; + if (p->iph.ttl) + p->iph.frag_off |= htons(IP_DF); + return 0; +} - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == sitn->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - if ((t = ipip6_tunnel_locate(net, &p, 0)) == NULL) - goto done; - err = -EPERM; - if (t == netdev_priv(sitn->fb_tunnel_dev)) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; +static int +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p) +{ + struct ip_tunnel *t = netdev_priv(dev); - case SIOCGETPRL: - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; - err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); - break; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + t = ipip6_tunnel_locate(t->net, p, 0); + if (!t) + t = netdev_priv(dev); + memcpy(p, &t->parms, sizeof(*p)); + return 0; +} - case SIOCADDPRL: - case SIOCDELPRL: - case SIOCCHGPRL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = -EFAULT; - if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) - goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; - - switch (cmd) { - case SIOCDELPRL: - err = ipip6_tunnel_del_prl(t, &prl); - break; - case SIOCADDPRL: - case SIOCCHGPRL: - err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); - break; +static int +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = __ipip6_tunnel_ioctl_validate(t->net, p); + if (err) + return err; + + t = ipip6_tunnel_locate(t->net, p, 1); + if (!t) + return -ENOBUFS; + return 0; +} + +static int +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = __ipip6_tunnel_ioctl_validate(t->net, p); + if (err) + return err; + + t = ipip6_tunnel_locate(t->net, p, 0); + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (!t) + return -ENOENT; + } else { + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr)) + return -EINVAL; + t = netdev_priv(dev); } - netdev_state_change(dev); - break; -#ifdef CONFIG_IPV6_SIT_6RD - case SIOCADD6RD: - case SIOCCHG6RD: - case SIOCDEL6RD: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; + ipip6_tunnel_update(t, p, t->fwmark); + } - err = -EFAULT; - if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, - sizeof(ip6rd))) - goto done; + return 0; +} - t = netdev_priv(dev); +static int +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p) +{ + struct ip_tunnel *t = netdev_priv(dev); - if (cmd != SIOCDEL6RD) { - err = ipip6_tunnel_update_6rd(t, &ip6rd); - if (err < 0) - goto done; - } else - ipip6_tunnel_clone_6rd(dev, sitn); + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; - err = 0; - break; -#endif + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + t = ipip6_tunnel_locate(t->net, p, 0); + if (!t) + return -ENOENT; + if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) + return -EPERM; + dev = t->dev; + } + unregister_netdevice(dev); + return 0; +} +static int +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) +{ + switch (cmd) { + case SIOCGETTUNNEL: + return ipip6_tunnel_get(dev, p); + case SIOCADDTUNNEL: + return ipip6_tunnel_add(dev, p); + case SIOCCHGTUNNEL: + return ipip6_tunnel_change(dev, p); + case SIOCDELTUNNEL: + return ipip6_tunnel_del(dev, p); default: - err = -EINVAL; + return -EINVAL; } - -done: - return err; } -static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) +static int +ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { - if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + switch (cmd) { + case SIOCGETTUNNEL: + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + case SIOCDELTUNNEL: + return ip_tunnel_siocdevprivate(dev, ifr, data, cmd); + case SIOCGETPRL: + return ipip6_tunnel_get_prl(dev, data); + case SIOCADDPRL: + case SIOCDELPRL: + case SIOCCHGPRL: + return ipip6_tunnel_prl_ctl(dev, data, cmd); +#ifdef CONFIG_IPV6_SIT_6RD + case SIOCGET6RD: + return ipip6_tunnel_get6rd(dev, data); + case SIOCADD6RD: + case SIOCCHG6RD: + case SIOCDEL6RD: + return ipip6_tunnel_6rdctl(dev, data, cmd); +#endif + default: return -EINVAL; - dev->mtu = new_mtu; - return 0; + } } static const struct net_device_ops ipip6_netdev_ops = { + .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, - .ndo_do_ioctl = ipip6_tunnel_ioctl, - .ndo_change_mtu = ipip6_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, + .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; static void ipip6_dev_free(struct net_device *dev) { - free_percpu(dev->tstats); - free_netdev(dev); + struct ip_tunnel *tunnel = netdev_priv(dev); + + dst_cache_destroy(&tunnel->dst_cache); } +#define SIT_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) + static void ipip6_tunnel_setup(struct net_device *dev) { + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + dev->netdev_ops = &ipip6_netdev_ops; - dev->destructor = ipip6_dev_free; + dev->header_ops = &ip_tunnel_header_ops; + dev->needs_free_netdev = true; + dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN - t_hlen; + dev->min_mtu = IPV6_MIN_MTU; + dev->max_mtu = IP6_MAX_MTU - t_hlen; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; - dev->iflink = 0; + netif_keep_dst(dev); dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; + dev->features |= SIT_FEATURES; + dev->hw_features |= SIT_FEATURES; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + } static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + int err; tunnel->dev = dev; - tunnel->net = dev_net(dev); - - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); - memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) + return err; + + netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; } -static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) +static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - iph->version = 4; iph->protocol = IPPROTO_IPV6; iph->ihl = 5; iph->ttl = 64; - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - dev_hold(dev); rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); - return 0; } -static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) +static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { u8 proto; @@ -1307,16 +1479,15 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); - if (proto != IPPROTO_IPV6 && - proto != IPPROTO_IPIP && - proto != 0) + if (!ipip6_valid_ip_proto(proto)) return -EINVAL; return 0; } static void ipip6_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms, + __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -1328,33 +1499,10 @@ static void ipip6_netlink_parms(struct nlattr *data[], if (!data) return; - if (data[IFLA_IPTUN_LINK]) - parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); - - if (data[IFLA_IPTUN_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); - - if (data[IFLA_IPTUN_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); - - if (data[IFLA_IPTUN_TTL]) { - parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); - if (parms->iph.ttl) - parms->iph.frag_off = htons(IP_DF); - } - - if (data[IFLA_IPTUN_TOS]) - parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); - - if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) - parms->iph.frag_off = htons(IP_DF); - - if (data[IFLA_IPTUN_FLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); - - if (data[IFLA_IPTUN_PROTO]) - parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + ip_tunnel_netlink_parms(data, parms); + if (data[IFLA_IPTUN_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } #ifdef CONFIG_IPV6_SIT_6RD @@ -1370,8 +1518,7 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], if (data[IFLA_IPTUN_6RD_PREFIX]) { ret = true; - nla_memcpy(&ip6rd->prefix, data[IFLA_IPTUN_6RD_PREFIX], - sizeof(struct in6_addr)); + ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { @@ -1395,18 +1542,31 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], } #endif -static int ipip6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int ipip6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel *nt; + struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + struct net *net; int err; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); - ipip6_netlink_parms(data, &nt->parms); + nt->net = net; + + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { + err = ip_tunnel_encap_setup(nt, &ipencap); + if (err < 0) + return err; + } + + ipip6_netlink_parms(data, &nt->parms, &nt->fwmark); if (ipip6_tunnel_locate(net, &nt->parms, 0)) return -EEXIST; @@ -1415,29 +1575,50 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; + if (tb[IFLA_MTU]) { + u32 mtu = nla_get_u32(tb[IFLA_MTU]); + + if (mtu >= IPV6_MIN_MTU && + mtu <= IP6_MAX_MTU - dev->hard_header_len) + dev->mtu = mtu; + } + #ifdef CONFIG_IPV6_SIT_6RD - if (ipip6_netlink_6rd_parms(data, &ip6rd)) + if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, &ip6rd); + if (err < 0) + unregister_netdevice_queue(dev, NULL); + } #endif return err; } static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) + struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + __u32 fwmark = t->fwmark; + int err; if (dev == sitn->fb_tunnel_dev) return -EINVAL; - ipip6_netlink_parms(data, &p); + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { + err = ip_tunnel_encap_setup(t, &ipencap); + if (err < 0) + return err; + } + + ipip6_netlink_parms(data, &p, &fwmark); if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) @@ -1451,7 +1632,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], } else t = netdev_priv(dev); - ipip6_tunnel_update(t, &p); + ipip6_tunnel_update(t, &p, fwmark); #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) @@ -1490,30 +1671,42 @@ static size_t ipip6_get_size(const struct net_device *dev) /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ nla_total_size(2) + #endif + /* IFLA_IPTUN_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_DPORT */ + nla_total_size(2) + + /* IFLA_IPTUN_FWMARK */ + nla_total_size(4) + 0; } static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || - nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || - nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || + nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || + nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || - nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags)) + nla_put_be16(skb, IFLA_IPTUN_FLAGS, + ip_tunnel_flags_to_be16(parm->i_flags)) || + nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; #ifdef CONFIG_IPV6_SIT_6RD - if (nla_put(skb, IFLA_IPTUN_6RD_PREFIX, sizeof(struct in6_addr), - &tunnel->ip6rd.prefix) || - nla_put_be32(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, - tunnel->ip6rd.relay_prefix) || + if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX, + &tunnel->ip6rd.prefix) || + nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, + tunnel->ip6rd.relay_prefix) || nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN, tunnel->ip6rd.prefixlen) || nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, @@ -1521,6 +1714,16 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; #endif + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, + tunnel->encap.type) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, + tunnel->encap.sport) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, + tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, + tunnel->encap.flags)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1542,8 +1745,22 @@ static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, #endif + [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; +static void ipip6_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + + if (dev != sitn->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + static struct rtnl_link_ops sit_link_ops __read_mostly = { .kind = "sit", .maxtype = IFLA_IPTUN_MAX, @@ -1555,6 +1772,8 @@ static struct rtnl_link_ops sit_link_ops __read_mostly = { .changelink = ipip6_changelink, .get_size = ipip6_get_size, .fill_info = ipip6_fill_info, + .dellink = ipip6_dellink, + .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel sit_handler __read_mostly = { @@ -1569,9 +1788,17 @@ static struct xfrm_tunnel ipip_handler __read_mostly = { .priority = 2, }; -static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_head *head) +#if IS_ENABLED(CONFIG_MPLS) +static struct xfrm_tunnel mplsip_handler __read_mostly = { + .handler = mplsip_rcv, + .err_handler = ipip6_err, + .priority = 2, +}; +#endif + +static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head) { - struct net *net = dev_net(sitn->fb_tunnel_dev); + struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; int prio; @@ -1579,20 +1806,20 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea if (dev->rtnl_link_ops == &sit_link_ops) unregister_netdevice_queue(dev, head); - for (prio = 1; prio < 4; prio++) { + for (prio = 0; prio < 4; prio++) { int h; - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; - t = rtnl_dereference(sitn->tunnels[prio][h]); - while (t != NULL) { + t = rtnl_net_dereference(net, sitn->tunnels[prio][h]); + while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ - if (dev_net(t->dev) != net) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1609,55 +1836,45 @@ static int __net_init sit_init_net(struct net *net) sitn->tunnels[2] = sitn->tunnels_r; sitn->tunnels[3] = sitn->tunnels_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; + sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!sitn->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; } dev_net_set(sitn->fb_tunnel_dev, net); + sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - - err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); - if (err) - goto err_dev_free; + sitn->fb_tunnel_dev->netns_immutable = true; - ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); + t = netdev_priv(sitn->fb_tunnel_dev); + t->net = net; - if ((err = register_netdev(sitn->fb_tunnel_dev))) + err = register_netdev(sitn->fb_tunnel_dev); + if (err) goto err_reg_dev; - t = netdev_priv(sitn->fb_tunnel_dev); + ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); + ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; err_reg_dev: - dev_put(sitn->fb_tunnel_dev); -err_dev_free: - ipip6_dev_free(sitn->fb_tunnel_dev); + free_netdev(sitn->fb_tunnel_dev); err_alloc_dev: return err; } -static void __net_exit sit_exit_net(struct net *net) -{ - struct sit_net *sitn = net_generic(net, sit_net_id); - LIST_HEAD(list); - - rtnl_lock(); - sit_destroy_tunnels(sitn, &list); - unregister_netdevice_queue(sitn->fb_tunnel_dev, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); -} - static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit = sit_exit_net, + .exit_rtnl = sit_exit_rtnl_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; @@ -1667,6 +1884,9 @@ static void __exit sit_cleanup(void) rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); +#endif unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ @@ -1676,7 +1896,7 @@ static int __init sit_init(void) { int err; - pr_info("IPv6 over IPv4 tunneling driver\n"); + pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&sit_net_ops); if (err < 0) @@ -1691,6 +1911,13 @@ static int __init sit_init(void) pr_info("%s: can't register ip4ip4\n", __func__); goto xfrm_tunnel4_failed; } +#if IS_ENABLED(CONFIG_MPLS) + err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); + if (err < 0) { + pr_info("%s: can't register mplsip\n", __func__); + goto xfrm_tunnel_mpls_failed; + } +#endif err = rtnl_link_register(&sit_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1699,6 +1926,10 @@ out: return err; rtnl_link_failed: +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); +xfrm_tunnel_mpls_failed: +#endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); @@ -1709,5 +1940,7 @@ xfrm_tunnel_failed: module_init(sit_init); module_exit(sit_cleanup); +MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0"); |
