From 6ad3122a08e3a9c2148873665752e87cf4f393cc Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 22 Feb 2016 10:40:07 +0100
Subject: flowcache: Avoid OOM condition under preasure

We can hit an OOM condition if we are under presure because
we can not free the entries in gc_list fast enough. So add
a counter for the not yet freed entries in the gc_list and
refuse new allocations if the value is too high.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/core/flow.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/flow.c b/net/core/flow.c
index 1033725be40b..3937b1b68d5b 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -92,8 +92,11 @@ static void flow_cache_gc_task(struct work_struct *work)
 	list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
 	spin_unlock_bh(&xfrm->flow_cache_gc_lock);
 
-	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
 		flow_entry_kill(fce, xfrm);
+		atomic_dec(&xfrm->flow_cache_gc_count);
+		WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0);
+	}
 }
 
 static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
@@ -101,6 +104,7 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
 				     struct netns_xfrm *xfrm)
 {
 	if (deleted) {
+		atomic_add(deleted, &xfrm->flow_cache_gc_count);
 		fcp->hash_count -= deleted;
 		spin_lock_bh(&xfrm->flow_cache_gc_lock);
 		list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
@@ -232,6 +236,13 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 		if (fcp->hash_count > fc->high_watermark)
 			flow_cache_shrink(fc, fcp);
 
+		if (fcp->hash_count > 2 * fc->high_watermark ||
+		    atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) {
+			atomic_inc(&net->xfrm.flow_cache_genid);
+			flo = ERR_PTR(-ENOBUFS);
+			goto ret_object;
+		}
+
 		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
 			fle->net = net;
@@ -446,6 +457,7 @@ int flow_cache_init(struct net *net)
 	INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
 	INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
 	mutex_init(&net->xfrm.flow_flush_sem);
+	atomic_set(&net->xfrm.flow_cache_gc_count, 0);
 
 	fc->hash_shift = 10;
 	fc->low_watermark = 2 * flow_cache_hash_size(fc);
-- 
cgit 


From 215276c0147ef49bc07692ca68bae35a30a64b9a Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 22 Feb 2016 10:56:45 +0100
Subject: xfrm: Reset encapsulation field of the skb before transformation

The inner headers are invalid after a xfrm transformation.
So reset the skb encapsulation field to ensure nobody tries
to access the inner headers.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_output.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index ff4a91fcab9f..637387bbaaea 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -99,6 +99,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
 
 		skb_dst_force(skb);
 
+		/* Inner headers are invalid now. */
+		skb->encapsulation = 0;
+
 		err = x->type->output(x, skb);
 		if (err == -EINPROGRESS)
 			goto out;
-- 
cgit 


From d6af1a31cc72fbd558c7eddbc36f61bf09d1cf6a Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 16 Mar 2016 10:17:37 +0100
Subject: vti: Add pmtu handling to vti_xmit.

We currently rely on the PMTU discovery of xfrm.
However if a packet is locally sent, the PMTU mechanism
of xfrm tries to do local socket notification what
might not work for applications like ping that don't
check for this. So add pmtu handling to vti_xmit to
report MTU changes immediately.

Reported-by: Mark McKinstry <Mark.McKinstry@alliedtelesis.co.nz>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_vti.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5cf10b777b7e..a917903d5e97 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -156,6 +156,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *tdev;	/* Device to other host */
 	int err;
+	int mtu;
 
 	if (!dst) {
 		dev->stats.tx_carrier_errors++;
@@ -192,6 +193,23 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
 			tunnel->err_count = 0;
 	}
 
+	mtu = dst_mtu(dst);
+	if (skb->len > mtu) {
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+		if (skb->protocol == htons(ETH_P_IP)) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+		} else {
+			if (mtu < IPV6_MIN_MTU)
+				mtu = IPV6_MIN_MTU;
+
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		}
+
+		dst_release(dst);
+		goto tx_error;
+	}
+
 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
 	skb_dst_set(skb, dst);
 	skb->dev = skb_dst(skb)->dev;
-- 
cgit 


From b8670c09f37bdf2847cc44f36511a53afc6161fd Mon Sep 17 00:00:00 2001
From: Kangjie Lu <kangjielu@gmail.com>
Date: Tue, 3 May 2016 16:35:05 -0400
Subject: net: fix infoleak in llc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The stack object “info” has a total size of 12 bytes. Its last byte
is padding which is not initialized and leaked via “put_cmsg”.

Signed-off-by: Kangjie Lu <kjlu@gatech.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/af_llc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index b3c52e3f689a..8ae3ed97d95c 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -626,6 +626,7 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
 	if (llc->cmsg_flags & LLC_CMSG_PKTINFO) {
 		struct llc_pktinfo info;
 
+		memset(&info, 0, sizeof(info));
 		info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex;
 		llc_pdu_decode_dsap(skb, &info.lpi_sap);
 		llc_pdu_decode_da(skb, info.lpi_mac);
-- 
cgit 


From 5f8e44741f9f216e33736ea4ec65ca9ac03036e6 Mon Sep 17 00:00:00 2001
From: Kangjie Lu <kangjielu@gmail.com>
Date: Tue, 3 May 2016 16:46:24 -0400
Subject: net: fix infoleak in rtnetlink
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The stack object “map” has a total size of 32 bytes. Its last 4
bytes are padding generated by compiler. These padding bytes are
not initialized and sent out via “nla_put”.

Signed-off-by: Kangjie Lu <kjlu@gatech.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a75f7e94b445..65763c29f845 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1180,14 +1180,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 
 static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 {
-	struct rtnl_link_ifmap map = {
-		.mem_start   = dev->mem_start,
-		.mem_end     = dev->mem_end,
-		.base_addr   = dev->base_addr,
-		.irq         = dev->irq,
-		.dma         = dev->dma,
-		.port        = dev->if_port,
-	};
+	struct rtnl_link_ifmap map;
+
+	memset(&map, 0, sizeof(map));
+	map.mem_start   = dev->mem_start;
+	map.mem_end     = dev->mem_end;
+	map.base_addr   = dev->base_addr;
+	map.irq         = dev->irq;
+	map.dma         = dev->dma;
+	map.port        = dev->if_port;
+
 	if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
 		return -EMSGSIZE;
 
-- 
cgit 


From dedc58e067d8c379a15a8a183c5db318201295bb Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@docker.com>
Date: Wed, 4 May 2016 14:21:53 +0100
Subject: VSOCK: do not disconnect socket when peer has shutdown SEND only

The peer may be expecting a reply having sent a request and then done a
shutdown(SHUT_WR), so tearing down the whole socket at this point seems
wrong and breaks for me with a client which does a SHUT_WR.

Looking at other socket family's stream_recvmsg callbacks doing a shutdown
here does not seem to be the norm and removing it does not seem to have
had any adverse effects that I can see.

I'm using Stefan's RFC virtio transport patches, I'm unsure of the impact
on the vmci transport.

Signed-off-by: Ian Campbell <ian.campbell@docker.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Cc: Andy King <acking@vmware.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
Cc: Jorgen Hansen <jhansen@vmware.com>
Cc: Adit Ranadive <aditr@vmware.com>
Cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/af_vsock.c | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 3dce53ebea92..b5f1221f48d4 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1808,27 +1808,8 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	else if (sk->sk_shutdown & RCV_SHUTDOWN)
 		err = 0;
 
-	if (copied > 0) {
-		/* We only do these additional bookkeeping/notification steps
-		 * if we actually copied something out of the queue pair
-		 * instead of just peeking ahead.
-		 */
-
-		if (!(flags & MSG_PEEK)) {
-			/* If the other side has shutdown for sending and there
-			 * is nothing more to read, then modify the socket
-			 * state.
-			 */
-			if (vsk->peer_shutdown & SEND_SHUTDOWN) {
-				if (vsock_stream_has_data(vsk) <= 0) {
-					sk->sk_state = SS_UNCONNECTED;
-					sock_set_flag(sk, SOCK_DONE);
-					sk->sk_state_change(sk);
-				}
-			}
-		}
+	if (copied > 0)
 		err = copied;
-	}
 
 out:
 	release_sock(sk);
-- 
cgit 


From 31ca0458a61a502adb7ed192bf9716c6d05791a5 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 4 May 2016 16:18:45 +0200
Subject: net: bridge: fix old ioctl unlocked net device walk

get_bridge_ifindices() is used from the old "deviceless" bridge ioctl
calls which aren't called with rtnl held. The comment above says that it is
called with rtnl but that is not really the case.
Here's a sample output from a test ASSERT_RTNL() which I put in
get_bridge_ifindices and executed "brctl show":
[  957.422726] RTNL: assertion failed at net/bridge//br_ioctl.c (30)
[  957.422925] CPU: 0 PID: 1862 Comm: brctl Tainted: G        W  O
4.6.0-rc4+ #157
[  957.423009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.8.1-20150318_183358- 04/01/2014
[  957.423009]  0000000000000000 ffff880058adfdf0 ffffffff8138dec5
0000000000000400
[  957.423009]  ffffffff81ce8380 ffff880058adfe58 ffffffffa05ead32
0000000000000001
[  957.423009]  00007ffec1a444b0 0000000000000400 ffff880053c19130
0000000000008940
[  957.423009] Call Trace:
[  957.423009]  [<ffffffff8138dec5>] dump_stack+0x85/0xc0
[  957.423009]  [<ffffffffa05ead32>]
br_ioctl_deviceless_stub+0x212/0x2e0 [bridge]
[  957.423009]  [<ffffffff81515beb>] sock_ioctl+0x22b/0x290
[  957.423009]  [<ffffffff8126ba75>] do_vfs_ioctl+0x95/0x700
[  957.423009]  [<ffffffff8126c159>] SyS_ioctl+0x79/0x90
[  957.423009]  [<ffffffff8163a4c0>] entry_SYSCALL_64_fastpath+0x23/0xc1

Since it only reads bridge ifindices, we can use rcu to safely walk the net
device list. Also remove the wrong rtnl comment above.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 263b4de4de57..60a3dbfca8a1 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -21,18 +21,19 @@
 #include <asm/uaccess.h>
 #include "br_private.h"
 
-/* called with RTNL */
 static int get_bridge_ifindices(struct net *net, int *indices, int num)
 {
 	struct net_device *dev;
 	int i = 0;
 
-	for_each_netdev(net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
 		if (i >= num)
 			break;
 		if (dev->priv_flags & IFF_EBRIDGE)
 			indices[i++] = dev->ifindex;
 	}
+	rcu_read_unlock();
 
 	return i;
 }
-- 
cgit 


From 856ce5d083e14571d051301fe3c65b32b8cbe321 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Wed, 4 May 2016 17:25:02 +0200
Subject: bridge: fix igmp / mld query parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the newly introduced helper functions the skb pulling is hidden
in the checksumming function - and undone before returning to the
caller.

The IGMP and MLD query parsing functions in the bridge still
assumed that the skb is pointing to the beginning of the IGMP/MLD
message while it is now kept at the beginning of the IPv4/6 header.

If there is a querier somewhere else, then this either causes
the multicast snooping to stay disabled even though it could be
enabled. Or, if we have the querier enabled too, then this can
create unnecessary IGMP / MLD query messages on the link.

Fixing this by taking the offset between IP and IGMP/MLD header into
account, too.

Fixes: 9afd85c9e455 ("net: Export IGMP/MLD message validation code")
Reported-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 191ea66e4d92..6852f3c7009c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1279,6 +1279,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 	struct br_ip saddr;
 	unsigned long max_delay;
 	unsigned long now = jiffies;
+	unsigned int offset = skb_transport_offset(skb);
 	__be32 group;
 	int err = 0;
 
@@ -1289,14 +1290,14 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 
 	group = ih->group;
 
-	if (skb->len == sizeof(*ih)) {
+	if (skb->len == offset + sizeof(*ih)) {
 		max_delay = ih->code * (HZ / IGMP_TIMER_SCALE);
 
 		if (!max_delay) {
 			max_delay = 10 * HZ;
 			group = 0;
 		}
-	} else if (skb->len >= sizeof(*ih3)) {
+	} else if (skb->len >= offset + sizeof(*ih3)) {
 		ih3 = igmpv3_query_hdr(skb);
 		if (ih3->nsrcs)
 			goto out;
@@ -1357,6 +1358,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 	struct br_ip saddr;
 	unsigned long max_delay;
 	unsigned long now = jiffies;
+	unsigned int offset = skb_transport_offset(skb);
 	const struct in6_addr *group = NULL;
 	bool is_general_query;
 	int err = 0;
@@ -1366,8 +1368,8 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 	    (port && port->state == BR_STATE_DISABLED))
 		goto out;
 
-	if (skb->len == sizeof(*mld)) {
-		if (!pskb_may_pull(skb, sizeof(*mld))) {
+	if (skb->len == offset + sizeof(*mld)) {
+		if (!pskb_may_pull(skb, offset + sizeof(*mld))) {
 			err = -EINVAL;
 			goto out;
 		}
@@ -1376,7 +1378,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 		if (max_delay)
 			group = &mld->mld_mca;
 	} else {
-		if (!pskb_may_pull(skb, sizeof(*mld2q))) {
+		if (!pskb_may_pull(skb, offset + sizeof(*mld2q))) {
 			err = -EINVAL;
 			goto out;
 		}
-- 
cgit 


From 1d2f7b2d956e242179aaf4a08f3545f99c81f9a3 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 4 May 2016 21:26:08 -0700
Subject: net: ipv6: tcp reset, icmp need to consider L3 domain

Responses for packets to unused ports are getting lost with L3 domains.

IPv4 has ip_send_unicast_reply for sending TCP responses which accounts
for L3 domains; update the IPv6 counterpart tcp_v6_send_response.
For icmp the L3 master check needs to be moved up in icmp6_send
to properly respond to UDP packets to a port with no listener.

Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c     | 5 ++---
 net/ipv6/tcp_ipv6.c | 7 ++++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 0a37ddc7af51..0013cacf7164 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -445,6 +445,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
 
 	if (__ipv6_addr_needs_scope_id(addr_type))
 		iif = skb->dev->ifindex;
+	else
+		iif = l3mdev_master_ifindex(skb->dev);
 
 	/*
 	 *	Must not send error if the source does not uniquely
@@ -499,9 +501,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
 	else if (!fl6.flowi6_oif)
 		fl6.flowi6_oif = np->ucast_oif;
 
-	if (!fl6.flowi6_oif)
-		fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev);
-
 	dst = icmpv6_route_lookup(net, skb, sk, &fl6);
 	if (IS_ERR(dst))
 		goto out;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 711d209f9124..f443c6b0ce16 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -810,8 +810,13 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	fl6.flowi6_proto = IPPROTO_TCP;
 	if (rt6_need_strict(&fl6.daddr) && !oif)
 		fl6.flowi6_oif = tcp_v6_iif(skb);
-	else
+	else {
+		if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+			oif = skb->skb_iif;
+
 		fl6.flowi6_oif = oif;
+	}
+
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
-- 
cgit 


From 43b8448cd7b42a4c39476c9a12c960c1408f1946 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Tue, 3 May 2016 16:10:20 -0700
Subject: udp_tunnel: Remove redundant udp_tunnel_gro_complete().

The setting of the UDP tunnel GSO type is already performed by
udp[46]_gro_complete().

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index a39068b4a4d9..305d9ac68bd9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -228,8 +228,6 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff,
 	int err = -ENOSYS;
 	const struct net_offload **offloads;
 
-	udp_tunnel_gro_complete(skb, nhoff);
-
 	rcu_read_lock();
 	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
 	ops = rcu_dereference(offloads[proto]);
-- 
cgit 


From 229740c63169462a838a8b8e16391ed000934631 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Tue, 3 May 2016 16:10:21 -0700
Subject: udp_offload: Set encapsulation before inner completes.

UDP tunnel segmentation code relies on the inner offsets being set for
an UDP tunnel GSO packet, but the inner *_complete() functions will
set the inner offsets only if 'encapsulation' is set before calling
them.  Currently, udp_gro_complete() sets 'encapsulation' only after
the inner *_complete() functions are done.  This causes the inner
offsets having invalid values after udp_gro_complete() returns, which
in turn will make it impossible to properly segment the packet in case
it needs to be forwarded, which would be visible to the user either as
invalid packets being sent or as packet loss.

This patch fixes this by setting skb's 'encapsulation' in
udp_gro_complete() before calling into the inner complete functions,
and by making each possible UDP tunnel gro_complete() callback set the
inner_mac_header to the beginning of the tunnel payload.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Reviewed-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c         | 4 ++++
 net/ipv4/udp_offload.c | 8 +++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 305d9ac68bd9..a6962ccad98a 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -236,6 +236,8 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff,
 
 	err = ops->callbacks.gro_complete(skb, nhoff);
 
+	skb_set_inner_mac_header(skb, nhoff);
+
 out_unlock:
 	rcu_read_unlock();
 
@@ -412,6 +414,8 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff,
 
 	err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
 
+	skb_set_inner_mac_header(skb, nhoff + guehlen);
+
 out_unlock:
 	rcu_read_unlock();
 	return err;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0ed2dafb7cc4..e330c0e56b11 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -399,6 +399,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
 
 	uh->len = newlen;
 
+	/* Set encapsulation before calling into inner gro_complete() functions
+	 * to make them set up the inner offsets.
+	 */
+	skb->encapsulation = 1;
+
 	rcu_read_lock();
 
 	uo_priv = rcu_dereference(udp_offload_base);
@@ -421,9 +426,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
 	if (skb->remcsum_offload)
 		skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
 
-	skb->encapsulation = 1;
-	skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr));
-
 	return err;
 }
 
-- 
cgit