From 254d900b801fc04aa524ff7bafe28fdd1dbf0ed6 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Fri, 14 Jul 2017 12:04:16 +0300
Subject: ipv4: ip_do_fragment: fix headroom tests

Some time ago David Woodhouse reported skb_under_panic
when we try to push ethernet header to fragmented ipv6 skbs.
It was fixed for ipv6 by Florian Westphal in
commit 1d325d217c7f ("ipv6: ip6_fragment: fix headroom tests and skb leak")

However similar problem still exist in ipv4.

It does not trigger skb_under_panic due paranoid check
in ip_finish_output2, however according to Alexey Kuznetsov
current state is abnormal and ip_fragment should be fixed too.

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7eb252dcecee..50c74cd890bc 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -599,6 +599,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	hlen = iph->ihl * 4;
 	mtu = mtu - hlen;	/* Size of data space */
 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
+	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
 
 	/* When frag_list is given, use it. First, check its validity:
 	 * some transformers could create wrong frag_list or break existing
@@ -614,14 +615,15 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
 		    ip_is_fragment(iph) ||
-		    skb_cloned(skb))
+		    skb_cloned(skb) ||
+		    skb_headroom(skb) < ll_rs)
 			goto slow_path;
 
 		skb_walk_frags(skb, frag) {
 			/* Correct geometry. */
 			if (frag->len > mtu ||
 			    ((frag->len & 7) && frag->next) ||
-			    skb_headroom(frag) < hlen)
+			    skb_headroom(frag) < hlen + ll_rs)
 				goto slow_path_clean;
 
 			/* Partially cloned skb? */
@@ -711,8 +713,6 @@ slow_path:
 	left = skb->len - hlen;		/* Space per frame */
 	ptr = hlen;		/* Where to start from */
 
-	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
-
 	/*
 	 *	Fragment the datagram.
 	 */
-- 
cgit 


From 4aea287e90dd61a48268ff2994b56f9799441b62 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 14 Jul 2017 17:49:21 -0400
Subject: tcp_bbr: cut pacing rate only if filled pipe

In bbr_set_pacing_rate(), which decides whether to cut the pacing
rate, there was some code that considered exiting STARTUP to be
equivalent to the notion of filling the pipe (i.e.,
bbr_full_bw_reached()). Specifically, as the code was structured,
exiting STARTUP and going into PROBE_RTT could cause us to cut the
pacing rate down to something silly and low, based on whatever
bandwidth samples we've had so far, when it's possible that all of
them have been small app-limited bandwidth samples that are not
representative of the bandwidth available in the path. (The code was
correct at the time it was written, but the state machine changed
without this spot being adjusted correspondingly.)

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index dbcc9352a48f..743e97511dc8 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -220,12 +220,11 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
  */
 static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
-	struct bbr *bbr = inet_csk_ca(sk);
 	u64 rate = bw;
 
 	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
 	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
-	if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
+	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
 		sk->sk_pacing_rate = rate;
 }
 
-- 
cgit 


From f19fd62dafaf1ed6cf615dba655b82fa9df59074 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 14 Jul 2017 17:49:22 -0400
Subject: tcp_bbr: introduce bbr_bw_to_pacing_rate() helper

Introduce a helper to convert a BBR bandwidth and gain factor to a
pacing rate in bytes per second. This is a pure refactor, but is
needed for two following fixes.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 743e97511dc8..29e23b851b97 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -211,6 +211,16 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
 	return rate >> BW_SCALE;
 }
 
+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
+static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	u64 rate = bw;
+
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	return rate;
+}
+
 /* Pace using current bw estimate and a gain factor. In order to help drive the
  * network toward lower queues while maintaining high utilization and low
  * latency, the average pacing rate aims to be slightly (~1%) lower than the
@@ -220,10 +230,8 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
  */
 static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
-	u64 rate = bw;
+	u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
 
-	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
-	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
 	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
 		sk->sk_pacing_rate = rate;
 }
-- 
cgit 


From 79135b89b8af304456bd67916b80116ddf03d7b6 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 14 Jul 2017 17:49:23 -0400
Subject: tcp_bbr: introduce bbr_init_pacing_rate_from_rtt() helper

Introduce a helper to initialize the BBR pacing rate unconditionally,
based on the current cwnd and RTT estimate. This is a pure refactor,
but is needed for two following fixes.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 29e23b851b97..3276140c2506 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -221,6 +221,23 @@ static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 	return rate;
 }
 
+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 bw;
+	u32 rtt_us;
+
+	if (tp->srtt_us) {		/* any RTT sample yet? */
+		rtt_us = max(tp->srtt_us >> 3, 1U);
+	} else {			 /* no RTT sample yet */
+		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
+	}
+	bw = (u64)tp->snd_cwnd * BW_UNIT;
+	do_div(bw, rtt_us);
+	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+}
+
 /* Pace using current bw estimate and a gain factor. In order to help drive the
  * network toward lower queues while maintaining high utilization and low
  * latency, the average pacing rate aims to be slightly (~1%) lower than the
@@ -805,7 +822,6 @@ static void bbr_init(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u64 bw;
 
 	bbr->prior_cwnd = 0;
 	bbr->tso_segs_goal = 0;	 /* default segs per skb until first ACK */
@@ -821,11 +837,8 @@ static void bbr_init(struct sock *sk)
 
 	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
 
-	/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
-	bw = (u64)tp->snd_cwnd * BW_UNIT;
-	do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
 	sk->sk_pacing_rate = 0;		/* force an update of sk_pacing_rate */
-	bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+	bbr_init_pacing_rate_from_rtt(sk);
 
 	bbr->restore_cwnd = 0;
 	bbr->round_start = 0;
-- 
cgit 


From 1d3648eb5d1fe9ed3d095ed8fa19ad11ca4c8bc0 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 14 Jul 2017 17:49:24 -0400
Subject: tcp_bbr: remove sk_pacing_rate=0 transient during init

Fix a corner case noticed by Eric Dumazet, where BBR's setting
sk->sk_pacing_rate to 0 during initialization could theoretically
cause packets in the sending host to hang if there were packets "in
flight" in the pacing infrastructure at the time the BBR congestion
control state is initialized. This could occur if the pacing
infrastructure happened to race with bbr_init() in a way such that the
pacer read the 0 rather than the immediately following non-zero pacing
rate.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 3276140c2506..42e0017f2ebc 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -837,7 +837,6 @@ static void bbr_init(struct sock *sk)
 
 	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
 
-	sk->sk_pacing_rate = 0;		/* force an update of sk_pacing_rate */
 	bbr_init_pacing_rate_from_rtt(sk);
 
 	bbr->restore_cwnd = 0;
-- 
cgit 


From 32984565574da7ed3afa10647bb4020d7a9e6c93 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 14 Jul 2017 17:49:25 -0400
Subject: tcp_bbr: init pacing rate on first RTT sample

Fixes the following behavior: for connections that had no RTT sample
at the time of initializing congestion control, BBR was initializing
the pacing rate to a high nominal rate (based an a guess of RTT=1ms,
in case this is LAN traffic). Then BBR never adjusted the pacing rate
downward upon obtaining an actual RTT sample, if the connection never
filled the pipe (e.g. all sends were small app-limited writes()).

This fix adjusts the pacing rate upon obtaining the first RTT sample.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 42e0017f2ebc..69ee877574d0 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -112,7 +112,8 @@ struct bbr {
 		cwnd_gain:10,	/* current gain for setting cwnd */
 		full_bw_cnt:3,	/* number of rounds without large bw gains */
 		cycle_idx:3,	/* current index in pacing_gain cycle array */
-		unused_b:6;
+		has_seen_rtt:1, /* have we seen an RTT sample yet? */
+		unused_b:5;
 	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
 	u32	full_bw;	/* recent bw, to estimate if pipe is full */
 };
@@ -225,11 +226,13 @@ static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
 	u64 bw;
 	u32 rtt_us;
 
 	if (tp->srtt_us) {		/* any RTT sample yet? */
 		rtt_us = max(tp->srtt_us >> 3, 1U);
+		bbr->has_seen_rtt = 1;
 	} else {			 /* no RTT sample yet */
 		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
 	}
@@ -247,8 +250,12 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
  */
 static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
 	u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
 
+	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
+		bbr_init_pacing_rate_from_rtt(sk);
 	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
 		sk->sk_pacing_rate = rate;
 }
@@ -837,6 +844,7 @@ static void bbr_init(struct sock *sk)
 
 	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
 
+	bbr->has_seen_rtt = 0;
 	bbr_init_pacing_rate_from_rtt(sk);
 
 	bbr->restore_cwnd = 0;
-- 
cgit 


From 974292defee033bc43ccfcb2fcefc3eba3905340 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 7 Jul 2017 13:29:03 +0200
Subject: netfilter: nf_tables: only allow in/output for arp packets

arp packets cannot be forwarded.

They can be bridged, but then they can be filtered using
either ebtables or nftables bridge family.

The bridge netfilter exposes a "call-arptables" switch which
pushes packets into arptables, but lets not expose this for nftables, so better
close this asap.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_tables_arp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 805c8ddfe860..4bbc273b45e8 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -72,8 +72,7 @@ static const struct nf_chain_type filter_arp = {
 	.family		= NFPROTO_ARP,
 	.owner		= THIS_MODULE,
 	.hook_mask	= (1 << NF_ARP_IN) |
-			  (1 << NF_ARP_OUT) |
-			  (1 << NF_ARP_FORWARD),
+			  (1 << NF_ARP_OUT),
 };
 
 static int __init nf_tables_arp_init(void)
-- 
cgit 


From 18bcf2907df935981266532e1e0d052aff2e6fae Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Mon, 17 Jul 2017 12:35:58 +0200
Subject: ipv4: ipv6: initialize treq->txhash in cookie_v[46]_check()

KMSAN reported use of uninitialized memory in skb_set_hash_from_sk(),
which originated from the TCP request socket created in
cookie_v6_check():

 ==================================================================
 BUG: KMSAN: use of uninitialized memory in tcp_transmit_skb+0xf77/0x3ec0
 CPU: 1 PID: 2949 Comm: syz-execprog Not tainted 4.11.0-rc5+ #2931
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 TCP: request_sock_TCPv6: Possible SYN flooding on port 20028. Sending cookies.  Check SNMP counters.
 Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:16
  dump_stack+0x172/0x1c0 lib/dump_stack.c:52
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:927
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:469
  skb_set_hash_from_sk ./include/net/sock.h:2011
  tcp_transmit_skb+0xf77/0x3ec0 net/ipv4/tcp_output.c:983
  tcp_send_ack+0x75b/0x830 net/ipv4/tcp_output.c:3493
  tcp_delack_timer_handler+0x9a6/0xb90 net/ipv4/tcp_timer.c:284
  tcp_delack_timer+0x1b0/0x310 net/ipv4/tcp_timer.c:309
  call_timer_fn+0x240/0x520 kernel/time/timer.c:1268
  expire_timers kernel/time/timer.c:1307
  __run_timers+0xc13/0xf10 kernel/time/timer.c:1601
  run_timer_softirq+0x36/0xa0 kernel/time/timer.c:1614
  __do_softirq+0x485/0x942 kernel/softirq.c:284
  invoke_softirq kernel/softirq.c:364
  irq_exit+0x1fa/0x230 kernel/softirq.c:405
  exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:657
  smp_apic_timer_interrupt+0x5a/0x80 arch/x86/kernel/apic/apic.c:966
  apic_timer_interrupt+0x86/0x90 arch/x86/entry/entry_64.S:489
 RIP: 0010:native_restore_fl ./arch/x86/include/asm/irqflags.h:36
 RIP: 0010:arch_local_irq_restore ./arch/x86/include/asm/irqflags.h:77
 RIP: 0010:__msan_poison_alloca+0xed/0x120 mm/kmsan/kmsan_instr.c:440
 RSP: 0018:ffff880024917cd8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10
 RAX: 0000000000000246 RBX: ffff8800224c0000 RCX: 0000000000000005
 RDX: 0000000000000004 RSI: ffff880000000000 RDI: ffffea0000b6d770
 RBP: ffff880024917d58 R08: 0000000000000dd8 R09: 0000000000000004
 R10: 0000160000000000 R11: 0000000000000000 R12: ffffffff85abf810
 R13: ffff880024917dd8 R14: 0000000000000010 R15: ffffffff81cabde4
  </IRQ>
  poll_select_copy_remaining+0xac/0x6b0 fs/select.c:293
  SYSC_select+0x4b4/0x4e0 fs/select.c:653
  SyS_select+0x76/0xa0 fs/select.c:634
  entry_SYSCALL_64_fastpath+0x13/0x94 arch/x86/entry/entry_64.S:204
 RIP: 0033:0x4597e7
 RSP: 002b:000000c420037ee0 EFLAGS: 00000246 ORIG_RAX: 0000000000000017
 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004597e7
 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
 RBP: 000000c420037ef0 R08: 000000c420037ee0 R09: 0000000000000059
 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000042dc20
 R13: 00000000000000f3 R14: 0000000000000030 R15: 0000000000000003
 chained origin:
  save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302
  kmsan_save_stack mm/kmsan/kmsan.c:317
  kmsan_internal_chain_origin+0x12a/0x1f0 mm/kmsan/kmsan.c:547
  __msan_store_shadow_origin_4+0xac/0x110 mm/kmsan/kmsan_instr.c:259
  tcp_create_openreq_child+0x709/0x1ae0 net/ipv4/tcp_minisocks.c:472
  tcp_v6_syn_recv_sock+0x7eb/0x2a30 net/ipv6/tcp_ipv6.c:1103
  tcp_get_cookie_sock+0x136/0x5f0 net/ipv4/syncookies.c:212
  cookie_v6_check+0x17a9/0x1b50 net/ipv6/syncookies.c:245
  tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989
  tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298
  tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487
  ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279
  NF_HOOK ./include/linux/netfilter.h:257
  ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322
  dst_input ./include/net/dst.h:492
  ip6_rcv_finish net/ipv6/ip6_input.c:69
  NF_HOOK ./include/linux/netfilter.h:257
  ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203
  __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208
  __netif_receive_skb net/core/dev.c:4246
  process_backlog+0x667/0xba0 net/core/dev.c:4866
  napi_poll net/core/dev.c:5268
  net_rx_action+0xc95/0x1590 net/core/dev.c:5333
  __do_softirq+0x485/0x942 kernel/softirq.c:284
 origin:
  save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302
  kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:198
  kmsan_kmalloc+0x7f/0xe0 mm/kmsan/kmsan.c:337
  kmem_cache_alloc+0x1c2/0x1e0 mm/slub.c:2766
  reqsk_alloc ./include/net/request_sock.h:87
  inet_reqsk_alloc+0xa4/0x5b0 net/ipv4/tcp_input.c:6200
  cookie_v6_check+0x4f4/0x1b50 net/ipv6/syncookies.c:169
  tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989
  tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298
  tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487
  ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279
  NF_HOOK ./include/linux/netfilter.h:257
  ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322
  dst_input ./include/net/dst.h:492
  ip6_rcv_finish net/ipv6/ip6_input.c:69
  NF_HOOK ./include/linux/netfilter.h:257
  ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203
  __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208
  __netif_receive_skb net/core/dev.c:4246
  process_backlog+0x667/0xba0 net/core/dev.c:4866
  napi_poll net/core/dev.c:5268
  net_rx_action+0xc95/0x1590 net/core/dev.c:5333
  __do_softirq+0x485/0x942 kernel/softirq.c:284
 ==================================================================

Similar error is reported for cookie_v4_check().

Fixes: 58d607d3e52f ("tcp: provide skb->hash to synack packets")
Signed-off-by: Alexander Potapenko <glider@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/syncookies.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0905cf04c2a4..03ad8778c395 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -335,6 +335,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq->rcv_isn		= ntohl(th->seq) - 1;
 	treq->snt_isn		= cookie;
 	treq->ts_off		= 0;
+	treq->txhash		= net_tx_rndhash();
 	req->mss		= mss;
 	ireq->ir_num		= ntohs(th->dest);
 	ireq->ir_rmt_port	= th->source;
-- 
cgit 


From 0ddf3fb2c43d2e65aee5de158ed694ea11ef229d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 18 Jul 2017 11:57:55 +0200
Subject: udp: preserve skb->dst if required for IP options processing

Eric noticed that in udp_recvmsg() we still need to access
skb->dst while processing the IP options.
Since commit 0a463c78d25b ("udp: avoid a cache miss on dequeue")
skb->dst is no more available at recvmsg() time and bad things
will happen if we enter the relevant code path.

This commit address the issue, avoid clearing skb->dst if
any IP options are present into the relevant skb.
Since the IP CB is contained in the first skb cacheline, we can
test it to decide to leverage the consume_stateless_skb()
optimization, without measurable additional cost in the faster
path.

v1 -> v2: updated commit message tags

Fixes: 0a463c78d25b ("udp: avoid a cache miss on dequeue")
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 25294d43e147..b057653ceca9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1388,6 +1388,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
 		unlock_sock_fast(sk, slow);
 	}
 
+	/* we cleared the head states previously only if the skb lacks any IP
+	 * options, see __udp_queue_rcv_skb().
+	 */
+	if (unlikely(IPCB(skb)->opt.optlen > 0))
+		skb_release_head_state(skb);
 	consume_stateless_skb(skb);
 }
 EXPORT_SYMBOL_GPL(skb_consume_udp);
@@ -1779,8 +1784,12 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk_mark_napi_id_once(sk, skb);
 	}
 
-	/* clear all pending head states while they are hot in the cache */
-	skb_release_head_state(skb);
+	/* At recvmsg() time we need skb->dst to process IP options-related
+	 * cmsg, elsewhere can we clear all pending head states while they are
+	 * hot in the cache
+	 */
+	if (likely(IPCB(skb)->opt.optlen == 0))
+		skb_release_head_state(skb);
 
 	rc = __udp_enqueue_schedule_skb(sk, skb);
 	if (rc < 0) {
-- 
cgit 


From 8799a221f5944a7d74516ecf46d58c28ec1d1f75 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Wed, 19 Jul 2017 15:41:33 -0700
Subject: ipv4: initialize fib_trie prior to register_netdev_notifier call.

Net stack initialization currently initializes fib-trie after the
first call to netdevice_notifier() call. In fact fib_trie initialization
needs to happen before first rtnl_register(). It does not cause any problem
since there are no devices UP at this moment, but trying to bring 'lo'
UP at initialization would make this assumption wrong and exposes the issue.

Fixes following crash

 Call Trace:
  ? alternate_node_alloc+0x76/0xa0
  fib_table_insert+0x1b7/0x4b0
  fib_magic.isra.17+0xea/0x120
  fib_add_ifaddr+0x7b/0x190
  fib_netdev_event+0xc0/0x130
  register_netdevice_notifier+0x1c1/0x1d0
  ip_fib_init+0x72/0x85
  ip_rt_init+0x187/0x1e9
  ip_init+0xe/0x1a
  inet_init+0x171/0x26c
  ? ipv4_offload_init+0x66/0x66
  do_one_initcall+0x43/0x160
  kernel_init_freeable+0x191/0x219
  ? rest_init+0x80/0x80
  kernel_init+0xe/0x150
  ret_from_fork+0x22/0x30
 Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08
 RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: ffff9b1500017c28
 CR2: 0000000000000014

Fixes: 7b1a74fdbb9e ("[NETNS]: Refactor fib initialization so it can handle multiple namespaces.")
Fixes: 7f9b80529b8a ("[IPV4]: fib hash|trie initialization")

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e678fa892dd..044d2a159a3c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1334,13 +1334,14 @@ static struct pernet_operations fib_net_ops = {
 
 void __init ip_fib_init(void)
 {
-	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
-	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
-	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+	fib_trie_init();
 
 	register_pernet_subsys(&fib_net_ops);
+
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-	fib_trie_init();
+	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
 }
-- 
cgit